Trie.hpp   [plain text]


/* -*- mode: C++; c-basic-offset: 4; tab-width: 4 -*-
 *
 * Copyright (c) 2008-2010 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

/*
 //This is the exposed iterface for the Trie template
 //TODO: add erase methods
 //TODO: re-enable iterators
 
 template <typename V>
 struct Trie {
	struct Entry
	{
 std::string		name;
 V				info;
 
 Entry(void) {}
 Entry(const std::string& N, V I) : name(N), info(I) {}
 };
 
 struct const_iterator : std::iterator<std::bidirectional_iterator_tag, const Entry>;
 
 const_iterator begin() const;
 const_iterator end() const;
 
 Trie(void);
 Trie(const uint8_t* start, const uint8_t* end);
 Trie(const std::vector<Entry>& entries);
 
 void emit(std::vector<uint8_t>& output);
 */


#ifndef __TRIE__
#define __TRIE__
#define TRIE_DEBUG (0)

#include <algorithm>
#include <vector>
#include <memory>
#include <string>
#include <map>
#include <iterator>

#include "MachOFileAbstraction.hpp"
#include "kextsplit.h"

#if __cplusplus <= 201103L
namespace std {
    template<typename T, typename... Args>
    std::unique_ptr<T> make_unique(Args&&... args)
    {
        return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
    }
}
#endif

namespace TrieUtils {
    
    static void append_uleb128(uint64_t value, std::vector<uint8_t>& out) {
        uint8_t byte;
        do {
            byte = value & 0x7F;
            value &= ~0x7F;
            if ( value != 0 )
                byte |= 0x80;
            out.push_back(byte);
            value = value >> 7;
        } while( byte >= 0x80 );
    }
    
    static void append_string(std::string str, std::vector<uint8_t>& out) {
        for(char& c : str)
            out.push_back(c);
        out.push_back('\0');
    }
    
    static unsigned int	uleb128_size(uint64_t value) {
        uint32_t result = 0;
        do {
            value = value >> 7;
            ++result;
        } while ( value != 0 );
        return result;
    }
    
    static
    inline uint64_t read_uleb128(const uint8_t*& p, const uint8_t* end) {
        uint64_t result = 0;
        int		 bit = 0;
        do {
            if (p == end)
#if __EXCEPTIONS
                throw "malformed uleb128 extends beyond trie";
#else
            kcgen_terminate("malformed uleb128 extends beyond trie");
#endif
            uint64_t slice = *p & 0x7f;
            
            if (bit >= 64 || slice << bit >> bit != slice)
#if __EXCEPTIONS
                throw "uleb128 too big for 64-bits";
#else
            kcgen_terminate("malformed uleb128 extends beyond trie");
#endif
            else {
                result |= (slice << bit);
                bit += 7;
            }
        }
        while (*p++ & 0x80);
        return result;
    }
};

template <typename V>
struct Trie {
    uint32_t count;
    uint32_t nodeCount;
    
    struct Entry
    {
        std::string		name;
        V				info;
        
        Entry(void) {}
        Entry(const std::string& N, V I) : name(N), info(I) {}
    };
    
    Trie(const std::vector<Entry>& entries) : count(0), nodeCount(1) {
        // make nodes for all exported symbols
        for (auto& entry : entries) {
            addEntry(entry);
        }
    }
    
    void emit(std::vector<uint8_t>& output) {
        // create vector of nodes
        std::vector<Node*> orderedNodes;
        orderedNodes.reserve(nodeCount);
        orderTrie(&root, orderedNodes);
        
        // assign each node in the vector an offset in the trie stream, iterating until all uleb128 sizes have stabilized
        bool more;
        do {
            uint32_t offset = 0;
            more = false;
            for (auto& node : orderedNodes) {
                if (node->updateOffset(offset)) {
                    more = true;
                }
            }
        } while ( more );
        
        // create trie stream
        for (auto& node : orderedNodes) {
            node->appendToStream(output);
        }
    }
    
    static
    inline void parseTrie(const uint8_t* start, const uint8_t* end, std::vector<Entry>& output)
    {
        // empty trie has no entries
        if ( start == end )
            return;
        char cummulativeString[4000];
        std::vector<EntryWithOffset> entries;
        processExportNode(start, start, end, cummulativeString, 0, entries);
        // to preserve tie layout order, sort by node offset
        std::sort(entries.begin(), entries.end());
        // copy to output
        output.reserve(entries.size());
        for (auto& entryWithOffset : entries) {
            output.push_back(entryWithOffset.entry);
        }
    }
    
private:
    struct Node
    {
        //This needs to be a map to unsure deterministic ordering of tries.
        std::map<std::string,std::unique_ptr<Node> > fChildren;
        bool				fIsTerminal;
        uint32_t			fTrieOffset;
        V					fInfo;
        
        Node(void) : fIsTerminal(false), fTrieOffset(0) {}
        Node(V v) : fIsTerminal(true), fTrieOffset(0), fInfo(v) {}
        Node(Node&&) = default;
        
        // byte for terminal node size in bytes, or 0x00 if not terminal node
        // teminal node (uleb128 flags, uleb128 addr [uleb128 other])
        // byte for child node count
        //  each child: zero terminated substring, uleb128 node offset
        bool updateOffset(uint32_t& offset) {
            uint32_t nodeSize = 1; // length of export info when no export info
            if ( fIsTerminal ) {
                nodeSize = fInfo.encodedSize();
                // do have export info, overall node size so far is uleb128 of export info + export info
                nodeSize += TrieUtils::uleb128_size(nodeSize);
            }
            // add children
            ++nodeSize; // byte for count of chidren
            
            for (auto &edge : fChildren) {
                nodeSize += edge.first.length() + 1 + TrieUtils::uleb128_size(edge.second->fTrieOffset);
            }
            
            bool result = (fTrieOffset != offset);
            fTrieOffset = offset;
            //fprintf(stderr, "updateOffset %p %05d %s\n", this, fTrieOffset, fCummulativeString);
            offset += nodeSize;
            // return true if fTrieOffset was changed
            return result;
        }
        
        void appendToStream(std::vector<uint8_t>& out) {
            if ( fIsTerminal ) {
                fInfo.appendToStream(out);
            }
            else {
                // no export info uleb128 of zero is one byte of zero
                out.push_back(0);
            }
            // write number of children
            out.push_back(fChildren.size());
            // write each child
            for (auto &edge : fChildren) {
                TrieUtils::append_string(edge.first, out);
                TrieUtils::append_uleb128(edge.second->fTrieOffset, out);
            }
        }
    };
    
    Node root;
    
    struct EntryWithOffset
    {
        uintptr_t		nodeOffset;
        Entry			entry;
        
        bool operator<(const EntryWithOffset& other) const { return ( nodeOffset < other.nodeOffset ); }
    };
    
    void addEntry(const std::string& fullStr, std::string::const_iterator start, V v) {
        Node *currentNode = &root;
        bool done = false;
        
        while (!done && !currentNode->fChildren.empty() ) {
            done = true;
            
            for (auto &entry : currentNode->fChildren) {
                auto res = std::mismatch(entry.first.begin(), entry.first.end(), start);
                
                if (res.first ==  entry.first.end()) {
                    //Matched a full edge, go down it
                    done = false;
                    currentNode = entry.second.get();
                    start = res.second;
                    break;
                } else if (res.first != entry.first.begin()) {
                    // found a common substring, splice in new node
                    //  was A -> C,  now A -> B -> C
                    
                    //Build the new strings
                    std::string abEdgeStr(entry.first.begin(), res.first);
                    std::string bcEdgeStr(res.first, entry.first.end());
                    
                    //Copy out the exist node and delete it from the currentNode
                    std::unique_ptr<Node> nodeC;
                    std::swap(nodeC, entry.second);
                    currentNode->fChildren.erase(entry.first);
                    
                    //Build the new node and insert it
                    std::unique_ptr<Node> nodeB = std::make_unique<Node>();
                    Node *newNode = nodeB.get();
                    
                    nodeB->fChildren.insert(std::make_pair(bcEdgeStr, std::move(nodeC)));
                    currentNode->fChildren.insert(std::make_pair(abEdgeStr, std::move(nodeB)));
                    
                    currentNode = newNode;
                    start = res.second;
                    ++nodeCount;
                    break;
                }
            }
        }
        
        // no commonality with any existing child, make a new edge that is this whole string
        std::string edgeStr(start, fullStr.end());
        v.willInsertAs(fullStr);
        
        if (edgeStr.empty()) {
            currentNode->fIsTerminal = true;
            currentNode->fInfo = v;
        } else {
            currentNode->fChildren.emplace(edgeStr, std::make_unique<Node>(v));
            ++nodeCount;
        }
        ++count;
    }
    
    void addEntry(Entry entry) {
        addEntry(entry.name, entry.name.begin(), entry.info);
    }
    
#if TRIE_DEBUG
    void printTrie(Node& node, std::string cummulativeString) {
        if (node.fTerminal) {
            printf("%s: \n", cummulativeString.c_str());
        }
        for (auto &edge : node.fChildren) {
            printTrie(*edge.second, cummulativeString+edge.first);
        }
    }
    
public:
    void printTrie(void) {
        printTrie(root, "");
    }
private:
#endif
    
    static inline void processExportNode(const uint8_t* const start, const uint8_t* p, const uint8_t* const end,
                                         char* cummulativeString, int curStrOffset,
                                         std::vector<EntryWithOffset>& output)
    {
        if ( p >= end )
#if __EXCEPTIONS
            throw "malformed trie, node past end";
#else
        kcgen_terminate("malformed trie, node past end");
#endif
        const uint8_t terminalSize = read_uleb128(p, end);
        const uint8_t* children = p + terminalSize;
        if ( terminalSize != 0 ) {
            EntryWithOffset e;
            e.nodeOffset = p-start;
            e.entry.name = cummulativeString;
            e.entry.info.loadData(p,end);
            output.push_back(e);
        }
        const uint8_t childrenCount = *children++;
        const uint8_t* s = children;
        for (uint8_t i=0; i < childrenCount; ++i) {
            int edgeStrLen = 0;
            while (*s != '\0') {
                cummulativeString[curStrOffset+edgeStrLen] = *s++;
                ++edgeStrLen;
            }
            cummulativeString[curStrOffset+edgeStrLen] = *s++;
            uint32_t childNodeOffet = (uint32_t)read_uleb128(s, end);
            processExportNode(start, start+childNodeOffet, end, cummulativeString, curStrOffset+edgeStrLen, output);
        }
    }
    
    void orderTrie(Node* node, std::vector<Node*>& orderedNodes) {
        orderedNodes.push_back(node);
        for (auto &edge : node->fChildren) {
            orderTrie(edge.second.get(), orderedNodes);
        }
    }
}; // struct Trie

struct ExportInfo {
    uint64_t		address;
    uint64_t		flags;
    uint64_t		other;
    std::string		importName;
    
    ExportInfo() : address(0), other(0) {}
//    ExportInfo() : other(0), address(0) {}
    
    uint32_t encodedSize(void) {
        uint32_t size = 0;
        if ( flags & EXPORT_SYMBOL_FLAGS_REEXPORT ) {
            size = TrieUtils::uleb128_size(flags) + TrieUtils::uleb128_size(other); // ordinal
            if ( !importName.empty() )
                size += importName.length();
            ++size; // trailing zero in imported name
        }
        else {
            size = TrieUtils::uleb128_size(flags) + TrieUtils::uleb128_size(address);
            if ( flags & EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER )
                size += TrieUtils::uleb128_size(other);
        }
        return size;
    }
    
    void appendToStream(std::vector<uint8_t>& out) {
        if ( flags & EXPORT_SYMBOL_FLAGS_REEXPORT ) {
            if ( !importName.empty() ) {
                // nodes with re-export info: size, flags, ordinal, string
                uint32_t nodeSize = (uint32_t)(TrieUtils::uleb128_size(flags) + TrieUtils::uleb128_size(other) + importName.length() + 1);
                out.push_back(nodeSize);
                TrieUtils::append_uleb128(flags, out);
                TrieUtils::append_uleb128(other, out);
                TrieUtils::append_string(importName, out);
            }
            else {
                // nodes with re-export info: size, flags, ordinal, empty-string
                uint32_t nodeSize = TrieUtils::uleb128_size(flags) + TrieUtils::uleb128_size(other) + 1;
                out.push_back(nodeSize);
                TrieUtils::append_uleb128(flags, out);
                TrieUtils::append_uleb128(other, out);
                out.push_back(0);
            }
        }
        else if ( flags & EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER ) {
            // nodes with export info: size, flags, address, other
            uint32_t nodeSize = TrieUtils::uleb128_size(flags) + TrieUtils::uleb128_size(address) + TrieUtils::uleb128_size(other);
            out.push_back(nodeSize);
            TrieUtils::append_uleb128(flags, out);
            TrieUtils::append_uleb128(address, out);
            TrieUtils::append_uleb128(other, out);
        }
        else {
            // nodes with export info: size, flags, address
            uint32_t nodeSize = TrieUtils::uleb128_size(flags) + TrieUtils::uleb128_size(address);
            out.push_back(nodeSize);
            TrieUtils::append_uleb128(flags, out);
            TrieUtils::append_uleb128(address, out);
        }
    }
    
    void loadData(const uint8_t* p, const uint8_t* const end) {
        flags = read_uleb128(p, end);
        if ( flags & EXPORT_SYMBOL_FLAGS_REEXPORT ) {
            other = read_uleb128(p, end); // dylib ordinal
            importName = (char*)p;
        }
        else {
            address = read_uleb128(p, end);
            if ( flags & EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER )
                other = read_uleb128(p, end);
        }
    }
    
    void willInsertAs(const std::string& name) {
        // Symbols re-exported under the same name do not need an explict import name, delete it to save space
        if ((name == importName) ) {
            importName = "";
        }
    }
};

typedef Trie<ExportInfo> ExportInfoTrie;


// Used by accelerator tables in dyld shared cache
struct DylibIndex {
    uint32_t		index;
    
    DylibIndex() : index(0) {}
    DylibIndex(uint32_t i) : index(i) {}
    
    uint32_t encodedSize(void) {
        return TrieUtils::uleb128_size(index);
    }
    
    void appendToStream(std::vector<uint8_t>& out) {
        uint32_t nodeSize = TrieUtils::uleb128_size(index);
        out.push_back(nodeSize);
        TrieUtils::append_uleb128(index, out);
    }
    
    void loadData(const uint8_t* p, const uint8_t* const end) {
        index = (uint32_t)TrieUtils::read_uleb128(p, end);
    }
    
};
typedef Trie<DylibIndex> DylibIndexTrie;


#endif	// __TRIE__