1 : // Copyright 2012 Google Inc. All Rights Reserved.
2 : //
3 : // Licensed under the Apache License, Version 2.0 (the "License");
4 : // you may not use this file except in compliance with the License.
5 : // You may obtain a copy of the License at
6 : //
7 : // http://www.apache.org/licenses/LICENSE-2.0
8 : //
9 : // Unless required by applicable law or agreed to in writing, software
10 : // distributed under the License is distributed on an "AS IS" BASIS,
11 : // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 : // See the License for the specific language governing permissions and
13 : // limitations under the License.
14 : //
15 : // A class that attempts to disassemble a function into basic blocks.
16 : //
17 : // Given a function block (dubbed macro block), this disassembler attempts to
18 : // cut it up into sequences of contiguous instruction runs and data blocks. A
19 : // contiguous instruction run is defined as a set of instructions that under
20 : // normal operation will always run from start to end. This class requires that
21 : // all external references to addresses within a function block have an
22 : // associated label.
23 :
24 : #ifndef SYZYGY_BLOCK_GRAPH_BASIC_BLOCK_DECOMPOSER_H_
25 : #define SYZYGY_BLOCK_GRAPH_BASIC_BLOCK_DECOMPOSER_H_
26 :
27 : #include <set>
28 : #include <string>
29 :
30 : #include "base/basictypes.h"
31 : #include "base/callback.h"
32 : #include "base/string_piece.h"
33 : #include "syzygy/block_graph/basic_block.h"
34 : #include "syzygy/block_graph/basic_block_subgraph.h"
35 : #include "syzygy/block_graph/block_graph.h"
36 : #include "syzygy/core/address.h"
37 : #include "syzygy/core/disassembler.h"
38 : #include "distorm.h" // NOLINT
39 :
40 m : namespace block_graph {
41 :
42 : // This class re-disassembles an already-processed code block (referred to
43 : // herein as a macro block) and breaks it up into basic blocks.
44 : //
45 : // A basic block is defined here as one of:
46 : //
47 : // 1) A series of code instructions that will be executed contiguously.
48 : // 2) A chunk of data (as determined by it being labeled as such).
49 : // 3) Padding (or unreachable code)
50 : //
51 : // The break-down into basic blocks happens in six passes:
52 : //
53 : // 1) Code disassembly starting from the block's inbound code references. This
54 : // carves all of the basic code blocks and creates control flow (successor)
55 : // relationships. While the basic blocks are being created, intra-block
56 : // successors cannot be resolved and are instead referenced by block offset;
57 : // inter-block successors are immediately resolved.
58 : // 2) Data block construction to carve out statically embedded data.
59 : // 3) Padding block construction to fill any gaps.
60 : // 4) Copying all inbound references (referrers) to their corresponding
61 : // basic block.
62 : // 5) Copying all references originating in the block to their corresponding
63 : // basic block.
64 : // 6) Wiring up all unresolved intra-block successors.
65 : //
66 : // The block to be decomposed must have been produced by a successful
67 : // PE decomposition (or some facsimile thereof).
68 m : class BasicBlockDecomposer {
69 m : public:
70 m : typedef core::AbsoluteAddress AbsoluteAddress;
71 m : typedef block_graph::BasicBlock BasicBlock;
72 m : typedef BasicBlock::BasicBlockType BasicBlockType;
73 m : typedef block_graph::BlockGraph BlockGraph;
74 m : typedef BlockGraph::Offset Offset;
75 m : typedef block_graph::BasicBlockSubGraph BasicBlockSubGraph;
76 :
77 : // Initialize the BasicBlockDecomposer instance.
78 : // @param block The block to be decomposed
79 : // @param subgraph The basic-block sub-graph data structure to populate.
80 : // @pre block is safe for basic-block decomposition. That is,
81 : // CodeBlockAttributesAreBasicBlockSafe(block) returns true.
82 m : BasicBlockDecomposer(const BlockGraph::Block* block,
83 m : BasicBlockSubGraph* subgraph);
84 :
85 : // Decomposes a function macro block into its constituent basic blocks.
86 : //
87 : // Immediately following a successful decomposition of a block to
88 : // basic-blocks, subgraph will contain all the basic-blocks found in the
89 : // source block and exactly one block description: that of the source block.
90 : //
91 : // Following decomposition, additional block descriptions can be created,
92 : // new basic blocks added, and basic blocks shuffled between the descriptions.
93 : // The subgraph can then be coalesced back into the BlockGraph from
94 : // which the original block came.
95 m : bool Decompose();
96 :
97 m : protected:
98 m : typedef std::map<Offset, BasicBlockReference> BasicBlockReferenceMap;
99 m : typedef core::AddressSpace<Offset, size_t, BasicBlock*> BBAddressSpace;
100 m : typedef BlockGraph::Block::SourceRange SourceRange;
101 m : typedef BlockGraph::Size Size;
102 m : typedef std::set<Offset> JumpTargets;
103 :
104 : // Returns the source range that coincides with the data range
105 : // [@p offset, [@p offset + @p size) in the original block.
106 m : SourceRange GetSourceRange(Offset offset, Size size) const;
107 :
108 : // Find the basic block, and corresponding byte-range, that contains the
109 : // given offset.
110 : // @param offset the starting offset you with the returned basic-block/range
111 : // to contain.
112 : // @param basic_block the basic-block containing @p offset.
113 : // @param range the byte-range in which @p basic_offset resides, which
114 : // contains @p offset.
115 m : bool FindBasicBlock(Offset offset,
116 m : BasicBlock** basic_block,
117 m : BBAddressSpace::Range* range) const;
118 :
119 : // Find the basic block that begins at the given offset.
120 : // @param offset The starting offset of the basic block you want to find.
121 : // @pre The basic block subgraph is derived from an original block (and
122 : // thus has an address space) and has been broken down into all of its
123 : // constituent basic blocks (i.e., post disassembly and basic-block
124 : // splitting).
125 m : BasicBlock* GetBasicBlockAt(Offset offset) const;
126 :
127 : // Helper function to end the current basic block and begin a new one
128 : // at @p offset. This is only to be used during disassembly.
129 m : bool EndCurrentBasicBlock(Offset end_offset);
130 :
131 : // Walk the function's code in a linear fashion, decomposing the block into
132 : // code and data basic blocks forming an original address space.
133 m : bool Disassemble();
134 :
135 : // Performs an initial linear pass at disassembling the code bytes of the
136 : // block into rudimentary basic blocks. The initial set of basic blocks are
137 : // each terminated at branch points. A subsequent pass will further split
138 : // basic blocks at branch destinations, see SplitCodeBlocksAtBranchTargets().
139 m : bool ParseInstructions();
140 :
141 : // @name Helpers for ParseInstructions().
142 : // @{
143 :
144 : // Initializes jump_targets_ to the set of referenced code locations.
145 : // This covers all locations which are externally referenced, as well as
146 : // those that are internally referenced via jump tables. These jump targets
147 : // may be otherwise un-discoverable through disassembly.
148 : // @param code_begin_offset the offset at which code bytes begin.
149 : // @param code_end_offset the first offset above @p code_begin_offset at
150 : // which the bytes are no longer code.
151 m : void InitJumpTargets(Offset code_begin_offset, Offset code_end_offset);
152 :
153 : // Decode the bytes at @p offset into @p instruction. This function takes
154 : // into consideration the range of offsets which denote code.
155 : // @param offset The offset of into block_ at which to start decoding.
156 : // @param code_end_offset The offset at which the bytes cease to be code.
157 : // @param instruction this value will be populated on success.
158 : // @returns true on success; false otherwise.
159 : // @note Used by ParseInstructions().
160 m : bool DecodeInstruction(Offset offset,
161 m : Offset code_end_offset,
162 m : Instruction* instruction) const;
163 :
164 : // Called for each instruction, this creates the Instruction object
165 : // corresponding to @p instruction, or terminates the current basic block
166 : // if @p instruction is a branch point.
167 : // @param instruction the decoded instruction.
168 : // @param offset the offset at which @p instruction occurs in the block.
169 : // @returns true on success; false otherwise.
170 : // @note Used by ParseInstructions().
171 m : bool HandleInstruction(const Instruction& instruction, Offset offset);
172 :
173 : // @}
174 :
175 : // @name Validation functions.
176 : // @{
177 : // Verifies that every identified jump target in the original code block
178 : // resolves to the start of a basic code block in the original code blocks
179 : // basic-block address space. This is protected for unit-testing purposes.
180 m : void CheckAllJumpTargetsStartABasicCodeBlock() const;
181 :
182 : // Verifies that the address space derived from the original code block
183 : // fully covers the original code block. This is protected for unit-testing
184 : // purposes.
185 m : void CheckHasCompleteBasicBlockCoverage() const;
186 :
187 : // Verifies that all basic blocks in the address space derived from the
188 : // original code block have valid successors or end in an instruction that
189 : // does not yield successors. This is protected for unit-testing purposes.
190 m : void CheckAllControlFlowIsValid() const;
191 :
192 : // Verifies that all labels in the original block are present in the
193 : // decomposed basic-block subgraph.
194 m : void CheckAllLabelsArePreserved() const;
195 : // @}
196 :
197 : // Split code basic-blocks at branch targets such that no basic-block
198 : // has a reference that it not to its head.
199 m : bool SplitCodeBlocksAtBranchTargets();
200 :
201 : // Creates basic blocks for all known data symbols in the block.
202 : // @returns true on success.
203 m : bool FillInDataBlocks();
204 :
205 : // Propagate the referrers from the original block into the basic blocks
206 : // so that referrers can be tracked as the basic blocks are manipulated.
207 m : bool CopyExternalReferrers();
208 :
209 : // Helper function to populate @p refs with the set of references originating
210 : // from its source range in the original block.
211 m : bool CopyReferences(Offset item_offset,
212 m : Size item_size,
213 m : BasicBlockReferenceMap* refs);
214 :
215 : // Propagate the references from the original block into the basic blocks
216 : // so that they can be tracked as the basic blocks are manipulated.
217 m : bool CopyReferences();
218 :
219 : // Resolve intra-block control flow references and referrers.
220 m : bool ResolveSuccessors();
221 :
222 : // Convert any unreachable code basic block into padding basic blocks.
223 m : void MarkUnreachableCodeAsPadding();
224 :
225 : // Inserts a basic block range into the decomposition.
226 m : bool InsertBasicBlockRange(Offset offset,
227 m : size_t size,
228 m : BasicBlockType type);
229 :
230 : // The block being disassembled.
231 m : const BlockGraph::Block* const block_;
232 :
233 : // The basic-block sub-graph to which the block will be decomposed.
234 m : BasicBlockSubGraph* subgraph_;
235 :
236 : // The layout of the original block into basic blocks in subgraph_.
237 m : BBAddressSpace original_address_space_;
238 :
239 : // Tracks locations our conditional branches jump to. Used to fix up basic
240 : // blocks by breaking up those that have a jump target in the middle.
241 m : JumpTargets jump_targets_;
242 :
243 : // The start offset of the current basic block during a walk.
244 m : Offset current_block_start_;
245 :
246 : // The list of instructions in the current basic block.
247 m : BasicBlock::Instructions current_instructions_;
248 :
249 : // The set of successors for the current basic block.
250 m : BasicBlock::Successors current_successors_;
251 :
252 : // A debugging flag indicating whether the decomposition results should be
253 : // CHECKed.
254 m : bool check_decomposition_results_;
255 m : };
256 :
257 m : } // namespace block_graph
258 :
259 : #endif // SYZYGY_BLOCK_GRAPH_BASIC_BLOCK_DECOMPOSER_H_
|