1 : // Copyright 2012 Google Inc. All Rights Reserved.
2 : //
3 : // Licensed under the Apache License, Version 2.0 (the "License");
4 : // you may not use this file except in compliance with the License.
5 : // You may obtain a copy of the License at
6 : //
7 : // http://www.apache.org/licenses/LICENSE-2.0
8 : //
9 : // Unless required by applicable law or agreed to in writing, software
10 : // distributed under the License is distributed on an "AS IS" BASIS,
11 : // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 : // See the License for the specific language governing permissions and
13 : // limitations under the License.
14 : //
15 : // The decomposer decomposes a given image file into a series of blocks
16 : // and references by reference to the image's symbols and disassembled
17 : // executable code.
18 : #ifndef SYZYGY_PE_DECOMPOSER_H_
19 : #define SYZYGY_PE_DECOMPOSER_H_
20 :
21 : #include <windows.h> // NOLINT
22 : #include <dia2.h>
23 : #include <map>
24 : #include <set>
25 : #include <string>
26 : #include <vector>
27 :
28 : #include "base/files/file_path.h"
29 : #include "pcrecpp.h" // NOLINT
30 : #include "syzygy/block_graph/block_graph.h"
31 : #include "syzygy/core/disassembler.h"
32 : #include "syzygy/core/serialization.h"
33 : #include "syzygy/pdb/pdb_data.h"
34 : #include "syzygy/pe/dia_browser.h"
35 : #include "syzygy/pe/image_layout.h"
36 : #include "syzygy/pe/pe_file.h"
37 : #include "syzygy/pe/pe_file_parser.h"
38 :
39 : // Forward declarations.
40 : namespace pdb {
41 : class PdbStream;
42 : class PdbFile;
43 : } // namespace pdb
44 :
45 : namespace pe {
46 :
47 : class Decomposer {
48 : public:
49 : // A struct for storing fixups.
50 : struct Fixup;
51 : // Used for storing references before the block graph is complete.
52 : struct IntermediateReference;
53 :
54 : typedef block_graph::BlockGraph BlockGraph;
55 : typedef core::AbsoluteAddress AbsoluteAddress;
56 : typedef core::RelativeAddress RelativeAddress;
57 : typedef core::AddressSpace<RelativeAddress, size_t, std::string> DataSpace;
58 : typedef core::Disassembler Disassembler;
59 : typedef Disassembler::CallbackDirective CallbackDirective;
60 : typedef std::map<RelativeAddress, Fixup> FixupMap;
61 : typedef std::map<RelativeAddress, IntermediateReference>
62 : IntermediateReferenceMap;
63 :
64 : // Initializes the decomposer for a given image file.
65 : // @param image_file the image file to decompose.
66 : explicit Decomposer(const PEFile& image_file);
67 :
68 : // Decomposes the image file into a BlockGraph and an ImageLayout, which
69 : // have the breakdown of code and data blocks with typed references and
70 : // information on where the blocks resided in the original image,
71 : // respectively.
72 : // @param image_layout the image layout to be populated by the decomposition.
73 : // @returns true on success, false on failure.
74 : bool Decompose(ImageLayout* image_layout);
75 :
76 : // @{
77 : // TODO(chrisha): Expose a mechanism for bulk-importing these via some JSON
78 : // representation. We will likely want to expose this on the command-line
79 : // of any utility using Decomposer.
80 :
81 : // Registers a pair of static initializer search patterns. Each of these
82 : // patterns will be converted to a regular expression, and they are required
83 : // to produce exactly one match group. The match group must be the same for
84 : // each of the patterns in order for the symbols to be correlated to each
85 : // other.
86 : // @param begin the regular-expression used to find the opening bracketing
87 : // symbol.
88 : // @param end the regular-expression used to find the end bracketing symbol.
89 : // @returns true on success, false otherwise.
90 : bool RegisterStaticInitializerPatterns(const base::StringPiece& begin,
91 : const base::StringPiece& end);
92 :
93 : // Registers a function as non-returning. This can be used to set
94 : // no-return semantics for functions whose debug information is missing or
95 : // incomplete.
96 : // @param function_name the undecorated function name.
97 : // @returns true if the function was added, false if it already existed in
98 : // the set.
99 : bool RegisterNonReturningFunction(const base::StringPiece& function_name);
100 :
101 : // Registers an imported symbol as a non-returning function. This can be used
102 : // to set no-return semantics for imported functions (we don't get symbol
103 : // information for these).
104 : // @param module_name the name of the imported module.
105 : // @param function_name the undecorated function name.
106 : // @returns true if the function was added, false if it already existed in
107 : // the set.
108 : bool RegisterNonReturningImport(const base::StringPiece& module_name,
109 : const base::StringPiece& function_name);
110 : // @}
111 :
112 : // Sets the PDB path to be used. If this is not called it will be inferred
113 : // using the information in the module, and searched for using the OS
114 : // search functionality.
115 : // @param pdb_path the path to the PDB file to be used in decomposing the
116 : // image.
117 E : void set_pdb_path(const base::FilePath& pdb_path) { pdb_path_ = pdb_path; }
118 :
119 : // Accessor to the PDB path. If Decompose has been called successfully this
120 : // will reflect the path of the PDB file that was used to perform the
121 : // decomposition.
122 : // @returns the PDB path.
123 E : const base::FilePath& pdb_path() const { return pdb_path_; }
124 :
125 : protected:
126 : typedef std::map<RelativeAddress, std::string> DataLabels;
127 : typedef std::vector<pdb::PdbFixup> PdbFixups;
128 :
129 : // Searches for (if necessary) the PDB file to be used in the decomposition,
130 : // and validates that the file exists and matches the module.
131 : bool FindAndValidatePdbPath();
132 :
133 : // Parse functions and thunks, using their data to annotate blocks.
134 : bool ProcessCodeSymbols(IDiaSymbol* globals);
135 : // Parses all function symbols.
136 : bool ProcessFunctionSymbols(IDiaSymbol* globals);
137 : // Create a function or thunk symbol.
138 : // @pre @p function is a function or a thunk.
139 : bool ProcessFunctionOrThunkSymbol(IDiaSymbol* function);
140 : // Create labels for @p function, which corresponds to @p block.
141 : bool CreateLabelsForFunction(IDiaSymbol* function, BlockGraph::Block* block);
142 : // Create blocks for all thunks in @p globals.
143 : // @note thunks are offspring of Compilands.
144 : bool ProcessThunkSymbols(IDiaSymbol* globals);
145 :
146 : // Enumerates labels in @p globals and adds them to the corresponding (code)
147 : // blocks.
148 : bool CreateGlobalLabels(IDiaSymbol* globals);
149 :
150 : // Creates a gap block of type @p block_type for the given range. For use by
151 : // CreateSectionGapBlocks.
152 : bool CreateGapBlock(BlockGraph::BlockType block_type,
153 : RelativeAddress address,
154 : BlockGraph::Size size);
155 : // Create blocks of type @p block_type for any gaps in the image
156 : // section represented by @p header.
157 : bool CreateSectionGapBlocks(const IMAGE_SECTION_HEADER* header,
158 : BlockGraph::BlockType block_type);
159 : // Creates gap blocks.
160 : bool CreateGapBlocks();
161 :
162 : // Processes the SectionContribution table, creating code/data blocks from it.
163 : bool CreateBlocksFromSectionContribs(IDiaSession* session);
164 :
165 : // Guesses data block alignments and padding.
166 : bool GuessDataBlockAlignments();
167 : // Process static initializer data labels, ensuring they remain contiguous.
168 : bool ProcessStaticInitializers();
169 :
170 : // These process symbols in the DIA tree via DiaBrowser and the following
171 : // callbacks.
172 : bool ProcessDataSymbols(IDiaSymbol* root);
173 : bool ProcessPublicSymbols(IDiaSymbol* root);
174 :
175 : // DiaBrowser callbacks.
176 : DiaBrowser::BrowserDirective OnDataSymbol(
177 : const DiaBrowser& dia_browser,
178 : const DiaBrowser::SymTagVector& sym_tags,
179 : const DiaBrowser::SymbolPtrVector& symbols);
180 : DiaBrowser::BrowserDirective OnPublicSymbol(
181 : const DiaBrowser& dia_browser,
182 : const DiaBrowser::SymTagVector& sym_tags,
183 : const DiaBrowser::SymbolPtrVector& symbols);
184 :
185 : // Translates intermediate references to block->block references.
186 : bool FinalizeIntermediateReferences();
187 :
188 : // Checks that the fixups were all visited.
189 : bool ConfirmFixupsVisited() const;
190 :
191 : // Searches through the final block graph, and labels blocks that are
192 : // simply padding blocks. This must be called after all references are
193 : // finalized.
194 : bool FindPaddingBlocks();
195 :
196 : // Parses the section headers and creates BlockGraph sections.
197 : bool CreateSections();
198 :
199 : // Parses the various debug streams. This populates fixup_map_ as well.
200 : bool LoadDebugStreams(IDiaSession* dia_session);
201 :
202 : // Validates a reference against a matching fixup, or creates a new
203 : // intermediate reference from @p src_addr to @p dst_addr of
204 : // type @p type and size @p size. This assumes an offset of
205 : // zero.
206 : // @returns true if the reference was successfully added, false otherwise.
207 : bool AddReferenceCallback(RelativeAddress src_addr,
208 : BlockGraph::ReferenceType type,
209 : BlockGraph::Size size,
210 : RelativeAddress dst_addr);
211 : // Parse the relocation entries.
212 : bool ParseRelocs();
213 : // Uses the fixup map to create cross-block references. These contain
214 : // relative references, lookup tables, absolute references, PC-relative from
215 : // code references, etc.
216 : bool CreateReferencesFromFixups();
217 : // Walk relocations and validate them against the fixups.
218 : bool ValidateRelocs(const PEFile::RelocMap& reloc_map);
219 : // Disassemble all code blocks and create code->code references.
220 : bool CreateCodeReferences();
221 : // Disassemble @p block. NOTE(review): presumably OnInstruction is invoked
222 : // for each instruction encountered -- confirm against the implementation.
223 : bool CreateCodeReferencesForBlock(BlockGraph::Block* block);
224 :
225 : // Parses the PE BlockGraph header and other important PE structures,
226 : // adds them as blocks to the image, and creates the references
227 : // they contain.
228 : bool CreatePEImageBlocksAndReferences(PEFileParser::PEHeader* header);
229 :
230 : // Creates a new block with the given properties, and attaches the
231 : // data to it. This assumes that no conflicting block exists.
232 : BlockGraph::Block* CreateBlock(BlockGraph::BlockType type,
233 : RelativeAddress address,
234 : BlockGraph::Size size,
235 : const base::StringPiece& name);
236 :
237 : enum FindOrCreateBlockDirective {
238 : // Expect that no block exists in the given range and that a block will be
239 : // created.
240 : kExpectNoBlock,
241 : // Allow the existence of a block with identical range to that provided.
242 : kAllowIdenticalBlock,
243 : // Allow the existence of a block that completely covers the provided range.
244 : kAllowCoveringBlock,
245 : // Allow the existence of a block that contains the start of the provided
246 : // range, but which may not fully contain the provided range.
247 : kAllowPartialCoveringBlock,
248 : };
249 : // Create block for the given @p address and @p size of the given @p type,
250 : // or return an existing block that has the same @p type, @p address and
251 : // @p size. Care must be taken in using the returned block. Regardless of the
252 : // provided directive, the block that is returned may be a strict superset
253 : // of the requested range, and offsets into it may need to be calculated.
254 : // @returns the block created or found, or NULL if there's a conflicting block
255 : // for the address range.
256 : BlockGraph::Block* FindOrCreateBlock(BlockGraph::BlockType type,
257 : RelativeAddress address,
258 : BlockGraph::Size size,
259 : const base::StringPiece& name,
260 : FindOrCreateBlockDirective directive);
261 :
262 : // @name OnInstruction helper functions.
263 : // @{
264 : void MarkDisassembledPastEnd();
265 : CallbackDirective LookPastInstructionForData(RelativeAddress instr_end);
266 : CallbackDirective VisitNonFlowControlInstruction(RelativeAddress instr_start,
267 : RelativeAddress instr_end);
268 : CallbackDirective VisitPcRelativeFlowControlInstruction(
269 : AbsoluteAddress instr_abs,
270 : RelativeAddress instr_rel,
271 : const _DInst& instruction,
272 : bool end_of_code);
273 : CallbackDirective VisitIndirectMemoryCallInstruction(
274 : const _DInst& instruction, bool end_of_code);
275 : CallbackDirective OnInstruction(const Disassembler& disassembler,
276 : const _DInst& instruction);
277 : // @}
278 :
279 : // Repairs the DIA "FIXUPS" with any loaded OMAP information, validates them,
280 : // and stores them. NOTE(review): presumably in fixup_map_ -- confirm.
281 : bool OmapAndValidateFixups(const std::vector<OMAP>& omap_from,
282 : const PdbFixups& pdb_fixups);
283 :
284 : // Check if there's a block-graph stream in the PDB and load it in this case.
285 : // @param pdb_path The path of the PDB file.
286 : // @param image_file The image file we're decomposing. This is used to set
287 : // block data pointers.
288 : // @param image The image-layout we're trying to populate.
289 : // @param stream_exists A pointer to a boolean to indicate if the block-graph
290 : // stream exists in the PDB.
291 : // @return true if the block-graph has been successfully loaded, false
292 : // otherwise.
293 : bool LoadBlockGraphFromPdb(const base::FilePath& pdb_path,
294 : const PEFile& image_file,
295 : ImageLayout* image,
296 : bool* stream_exists);
297 :
298 : // Load a block-graph from a PDB stream.
299 : // @param image_file The image file we're decomposing. This is used to set
300 : // block data pointers.
301 : // @param block_graph_stream The stream containing the block-graph.
302 : // @param image_layout The image-layout we're trying to populate.
303 : // @return true if the block-graph has been successfully loaded, false
304 : // otherwise.
305 : bool LoadBlockGraphFromPdbStream(const PEFile& image_file,
306 : pdb::PdbStream* block_graph_stream,
307 : ImageLayout* image_layout);
308 :
309 : // Try to get the block-graph stream from a PDB.
310 : // @param pdb_file The PDB file from which the stream will be read.
311 : // @returns a scoped_refptr to the stream in case of success, otherwise
312 : // the pointer will contain a NULL reference.
313 : scoped_refptr<pdb::PdbStream> GetBlockGraphStreamFromPdb(
314 : pdb::PdbFile* pdb_file);
315 :
316 : // Callback for use with PEFileParser. Will set the NON_RETURN_FUNCTION
317 : // attribute for imports that are found in the non_returning_imports_ set.
318 : bool OnImportThunkCallback(const char* module_name,
319 : const char* symbol_name,
320 : BlockGraph::Block* thunk);
321 :
322 : // The image address space we're decomposing to.
323 : BlockGraph::AddressSpace* image_;
324 :
325 : // The image file we're decomposing.
326 : // Note that the resultant BlockGraph will contain pointers to the
327 : // data in the image file, so the user must ensure the image file
328 : // outlives the BlockGraph.
329 : const PEFile& image_file_;
330 :
331 : // The path to the PDB file to be used in decomposing the image.
332 : base::FilePath pdb_path_;
333 :
334 : // Stores intermediate references before the block graph is complete.
335 : IntermediateReferenceMap references_;
336 :
337 : typedef std::set<BlockGraph::Block*> BlockSet;
338 : typedef std::set<BlockGraph::AddressSpace::Range> RangeSet;
339 : typedef std::map<RelativeAddress, std::string> LabelMap;
340 : typedef std::set<RelativeAddress> RelativeAddressSet;
341 : typedef pcrecpp::RE RE;
342 : typedef std::pair<RE, RE> REPair;
343 : typedef std::vector<REPair> REPairs;
344 : typedef std::set<std::string> StringSet;
345 : typedef std::map<std::string, StringSet> StringSetMap;
346 :
347 : // @name State tracking for the disassembler.
348 : // @{
349 : // The block we're currently disassembling. We need this for use in the
350 : // OnInstruction callback.
351 : BlockGraph::Block* current_block_;
352 : // Used to indicate the decomposer's handling of the current block. Needed
353 : // for OnInstruction callback.
354 : bool be_strict_with_current_block_;
355 : // @}
356 :
357 : // Keeps track of reloc entry information, which is used by various
358 : // pieces of the decomposer.
359 : PEFile::RelocSet reloc_set_;
360 :
361 : // Keeps track of fixups, which are necessary if we want to move around
362 : // code and data. These are keyed by the location in the image of the
363 : // reference. We keep them around so that the disassembly phase can be
364 : // validated against them.
365 : FixupMap fixup_map_;
366 : // A set of static initializer search pattern pairs. These are used to
367 : // ensure we don't break up blocks of static initializer function pointers.
368 : REPairs static_initializer_patterns_;
369 : // A set of functions known to be non-returning but not tagged as such in the
370 : // debug symbols.
371 : StringSet non_returning_functions_;
372 : // A map of module names, each containing a set of known non-returning
373 : // functions.
374 : StringSetMap non_returning_imports_;
375 : };
376 :
377 : // This stores fixups, but in a format more convenient for us than the
378 : // basic PdbFixup struct.
379 : struct Decomposer::Fixup {
380 : BlockGraph::ReferenceType type;  // The type of reference this fixup describes.
381 : bool refers_to_code;  // NOTE(review): presumably set when the target is code -- confirm.
382 : bool is_data;  // NOTE(review): presumably set when the fixup lies in data -- confirm.
383 : // Has this fixup been visited by our decomposition?
384 : bool visited;
385 : RelativeAddress location;  // Presumably the image address of the reference (the FixupMap key) -- confirm.
386 : RelativeAddress base;  // Presumably the address the reference is relative to/targets -- confirm.
387 : };
388 :
389 : // During decomposition we collect references in this format, e.g.
390 : // address->address. After chunking up the entire image into blocks,
391 : // we convert them to block->block references.
392 : // TODO(siggi): Is there reason to keep these in an address space to guard
393 : // against overlapping references?
394 : struct Decomposer::IntermediateReference {
395 : BlockGraph::ReferenceType type;  // The type of the reference.
396 : BlockGraph::Size size;  // The size of the reference.
397 : // A reference actually takes the form of a pointer that is offset
398 : // from a base address (its intended target). Direct references will
399 : // have offset = 0, but this allows us to represent offset references
400 : // into data as seen in loop induction variables, etc.
401 : RelativeAddress base;
402 : BlockGraph::Offset offset;
403 : };
404 :
405 : } // namespace pe
406 :
407 : #endif // SYZYGY_PE_DECOMPOSER_H_
|