// Philipp Klaus Krause, philipp@informatik.uni-frankfurt.de, pkk@spth.de, 2010 - 2013 // // (c) 2010-2012 Goethe-Universität Frankfurt // // This program is free software; you can redistribute it and/or modify it // under the terms of the GNU General Public License as published by the // Free Software Foundation; either version 2, or (at your option) any // later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. // // An optimal, polynomial-time register allocator. // // For details, see: // // Philipp Klaus Krause, // "Optimal Register Allocation in Polynomial Time", // Compiler Construction - 22nd International Conference, CC 2013, Held as Part of the European Joint Conferences on Theory and Practice of Software, ETAPS 2013. Proceedings, // Lecture Notes in Computer Science, volume 7791, pp. 1-20. // Springer, // 2013. // // To use this from a port do the following: // // 1) Supply a cost function // template // float instruction_cost(const assignment &a, unsigned short int i, const G_t &G, const I_t &I); // Which can range from // simple, e.g. cost 1 for each byte accessed in a register, cost 4 for each byte accessed in memory // to // quite involved, e.g. the number of bytes of code the code generator would generate. // // 2) Call // create_cfg(), thorup_tree_decomposition(), nicify(), alive_tree_dec(), tree_dec_ralloc_nodes(). // // The Z80 port can serve as an example, see z80_ralloc2_cc() in z80/ralloc2.cc. #ifndef SDCCRALLOC_HH #define SDCCRALLOC_HH 1 #include #include #include #include #include // Workaround for boost bug #11880 #include #if BOOST_VERSION == 106000 #include #endif #include #include #include #include #include #include "common.h" extern "C" { #include "SDCCbtree.h" } typedef short int var_t; typedef signed char reg_t; // Integer constant upper bound on port->num_regs #define MAX_NUM_REGS 9 // Assignment at an instruction struct i_assignment_t { var_t registers[MAX_NUM_REGS][2]; i_assignment_t(void) { for (reg_t r = 0; r < MAX_NUM_REGS; r++) for (unsigned int i = 0; i < 2; i++) registers[r][i] = -1; } #if 0 bool operator<(const i_assignment_t &i_a) const { for (reg_t r = 0; r < port->num_regs; r++) for (unsigned int i = 0; i < 2; i++) { if (registers[r][i] < i_a.registers[r][i]) return(true); else if (registers[r][i] > i_a.registers[r][i]) return(false); } return(false); } #endif void add_var(var_t v, reg_t r) { if (registers[r][1] < v) { registers[r][0] = registers[r][1]; registers[r][1] = v; } else registers[r][0] = v; } void remove_var(var_t v) { for (reg_t r = 0; r < port->num_regs; r++) { if (registers[r][1] == v) { registers[r][1] = registers[r][0]; registers[r][0] = -1; } else if (registers[r][0] == v) registers[r][0] = -1; } } }; typedef std::vector varset_t; // Faster than std::set, std::tr1::unordered_set and stx::btree_set here. typedef boost::container::flat_map icosts_t; // Faster than std::map and stx::btree_map here. typedef std::vector cfg_alive_t; // Faster than stx::btree_set here . typedef boost::container::flat_set cfg_dying_t; // Faster than stx::btree_set and std::set here. struct assignment { float s; varset_t local; // Entries: var std::vector global; // Entries: global[var] = reg (-1 if no reg assigned) icosts_t i_costs; // Costs for all instructions in bag (needed to avoid double counting costs at join nodes) i_assignment_t i_assignment; // Assignment at the instruction currently being added in an introduce node; bool marked; bool operator<(const assignment& a) const { varset_t::const_iterator i, ai, i_end, ai_end; i_end = local.end(); ai_end = a.local.end(); for (i = local.begin(), ai = a.local.begin();; ++i, ++ai) { if (i == i_end && ai == ai_end) return(false); if (i == i_end) return(true); if (ai == ai_end) return(false); if (*i < *ai) return(true); if (*i > *ai) return(false); if (global[*i] < a.global[*ai]) return(true); if (global[*i] > a.global[*ai]) return(false); } } }; typedef std::list assignment_list_t; //typedef std::vector assignment_list_t; // Probably faster, but would require some code reorganization. struct tree_dec_node { std::set bag; std::set alive; assignment_list_t assignments; unsigned weight; // The weight is the number of nodes at which intermediate results need to be remembered. In general, to minimize memory consumption, at join nodes the child with maximum weight should be processed first. }; typedef boost::container::flat_multimap operand_map_t; // Faster than std::multimap and stx::btree_multimap here. struct cfg_node { iCode *ic; operand_map_t operands; cfg_alive_t alive; cfg_dying_t dying; std::set stack_alive; #ifdef DEBUG_SEGV cfg_node(void); #endif }; #ifdef DEBUG_SEGV // This only exists to track down #3506333 and #3475617. bool default_constructor_of_cfg_node_called; cfg_node::cfg_node(void) { default_constructor_of_cfg_node_called = true; } #endif struct con_node { int v; int byte; int size; char *name; }; typedef boost::adjacency_list tree_dec_t; typedef boost::adjacency_list con_t; typedef boost::adjacency_matrix con2_t; typedef boost::adjacency_list cfg_t; typedef boost::adjacency_list cfg_sym_t; #ifdef HAVE_TREEDEC_COMBINATIONS_HPP #include TREEDEC_TREEDEC_BAG_TRAITS(tree_dec_t, bag); #endif #include "SDCCtree_dec.hpp" // Cost function. Port-specific. template static float instruction_cost(const assignment &a, unsigned short int i, const G_t &G, const I_t &I); // For early removel of assignments that cannot be extended to valid assignments. Port-specific. template static bool assignment_hopeless(const assignment &a, unsigned short int i, const G_t &G, const I_t &I, const var_t lastvar); // Rough cost estimate. Port-specific. template static float rough_cost_estimate(const assignment &a, unsigned short int i, const G_t &G, const I_t &I); // Avoid overwriting operands that are still needed by the result. Port-specific. template static void add_operand_conflicts_in_node(const cfg_node &n, I_t &I); // Port-specific template static void get_best_local_assignment_biased(assignment &a, typename boost::graph_traits::vertex_descriptor t, const T_t &T); // Code for another ic is generated when generating this one. Mark the other as generated. Port-specific. static void extra_ic_generated(iCode *ic); inline void add_operand_to_cfg_node(cfg_node &n, operand *o, std::map, var_t> &sym_to_index) { reg_t k; if (o && IS_SYMOP(o) && sym_to_index.find(std::pair(OP_SYMBOL_CONST(o)->key, 0)) != sym_to_index.end()) { if (n.operands.find(OP_SYMBOL_CONST(o)->key) == n.operands.end()) for (k = 0; k < OP_SYMBOL_CONST(o)->nRegs; k++) n.operands.insert(std::pair(OP_SYMBOL_CONST(o)->key, sym_to_index[std::pair(OP_SYMBOL_CONST(o)->key, k)])); } } // Check if the live-range of variable i is connected #if 0 // This check was too expensive - Profiling shows that compiling the Dhrystone benchmark for stm8 with default options, we spent about a quarter of compiler runtime in here! // Profiling shows that we spent a significant amount of time on the first call to copy_graph() // Todo: Improve efficiency, e.g. using subgraph or filtered_graph to avoid the costly first call to copy_graph() // Issues to solve: cfg2 is undirected, cfg is bidirectional; this makes use of subgraph or filtered_graph harder. static bool liverange_connected(cfg_t &cfg, var_t i) { cfg_sym_t cfg2; boost::copy_graph(cfg, cfg2, boost::vertex_copy(forget_properties()).edge_copy(forget_properties())); // This call to copy_graph is expensive! for (int j = boost::num_vertices(cfg) - 1; j >= 0; j--) { if (std::find(cfg[j].alive.begin(), cfg[j].alive.end(), i) == cfg[j].alive.end()) { boost::clear_vertex(j, cfg2); boost::remove_vertex(j, cfg2); } } std::vector::vertices_size_type> component(num_vertices(cfg2)); return(boost::connected_components(cfg2, &component[0]) <= 1); } #else // A not very elegant, but faster check static inline int component_size_impl(const cfg_t &cfg, const std::vector &life, var_t v, int i, std::vector& visited) { typename boost::graph_traits::in_edge_iterator in, in_end; typename boost::graph_traits::out_edge_iterator out, out_end; int size = 1; visited[i] = true; for(boost::tie(in, in_end) = boost::in_edges(i, cfg); in != in_end; ++in) if(life[boost::source(*in, cfg)] && !visited[boost::source(*in, cfg)]) size += component_size_impl(cfg, life, v, boost::source(*in, cfg), visited); for(boost::tie(out, out_end) = boost::out_edges(i, cfg); out != out_end; ++out) if(life[boost::target(*out, cfg)] && !visited[boost::target(*out, cfg)]) size += component_size_impl(cfg, life, v, boost::target(*out, cfg), visited); return(size); } static inline int component_size(const cfg_t &cfg, const std::vector &life, var_t v, int i) { std::vector visited(boost::num_vertices(cfg)); return(component_size_impl(cfg, life, v, i, visited)); } static bool liverange_connected(const cfg_t &cfg, var_t v) { std::vector life(boost::num_vertices(cfg)); int num_life = 0; int last_life; for(int i = 0; i < boost::num_vertices (cfg); i++) if(std::find(cfg[i].alive.begin(), cfg[i].alive.end(), v) != cfg[i].alive.end()) { life[i] = true; num_life++; last_life = i; } if(!num_life) return(true); return(component_size(cfg, life, v, last_life) >= num_life); } #endif // A quick-and-dirty function to get the CFG from sdcc. static iCode * create_cfg(cfg_t &cfg, con_t &con, ebbIndex *ebbi) { eBBlock **ebbs = ebbi->bbOrder; iCode *start_ic; iCode *ic; std::map key_to_index; std::map, var_t> sym_to_index; if(currFunc) currFunc->funcDivFlagSafe = 1; start_ic = iCodeLabelOptimize(iCodeFromeBBlock (ebbs, ebbi->count)); { int i; var_t j; wassertl (!boost::num_vertices(cfg), "CFG non-empty before creation."); for (ic = start_ic, i = 0, j = 0; ic; ic = ic->next, i++) { if (currFunc) currFunc->funcDivFlagSafe &= !(ic->op == INLINEASM || ic->op == '/' || ic->op == '%' || ic->op == PCALL || ic->op == CALL && (IS_OP_LITERAL (IC_LEFT (ic)) || !OP_SYMBOL(IC_LEFT (ic))->funcDivFlagSafe) || ic->op == RIGHT_OP && IS_OP_LITERAL (IC_RIGHT (ic))); // Right shift might be implemented using division. #ifdef DEBUG_SEGV default_constructor_of_cfg_node_called = false; #endif boost::add_vertex(cfg); #ifdef DEBUG_SEGV wassertl (default_constructor_of_cfg_node_called, "add_vertex failed to call default constructor of cfg_node!"); #endif wassertl (cfg[i].alive.empty(), "Alive set non-empty upon creation."); key_to_index[ic->key] = i; if(ic->op == SEND && ic->builtinSEND) // Ensure that only the very first send iCode is active. { operand *bi_parms[MAX_BUILTIN_ARGS]; int nbi_parms; getBuiltinParms(ic, &nbi_parms, bi_parms); } extra_ic_generated(ic); cfg[i].ic = ic; ic->rSurv = newBitVect(port->num_regs); // Never freed. Memory leak? ic->rMask = newBitVect(port->num_regs); // Never freed. Memory leak? if (ic->generated) continue; for (int j2 = 0; j2 <= operandKey; j2++) { if (bitVectBitValue(ic->rlive, j2)) { symbol *sym = (symbol *)(hTabItemWithKey(liveRanges, j2)); if (!sym->for_newralloc) continue; // Add node to conflict graph: if (sym_to_index.find(std::pair(j2, 0)) != sym_to_index.end()) continue; // Other parts of the allocator may rely on the variables corresponding to bytes from the same sdcc variable to have subsequent numbers. for (reg_t k = 0; k < sym->nRegs; k++) { boost::add_vertex(con); con[j].v = j2; con[j].byte = k; con[j].size = sym->nRegs; con[j].name = sym->name; sym_to_index[std::pair(j2, k)] = j; for (reg_t l = 0; l < k; l++) boost::add_edge(j - l - 1, j, con); j++; } } } } } // Get control flow graph from sdcc. for (ic = start_ic; ic; ic = ic->next) { wassertl (key_to_index[ic->key] < boost::num_vertices(cfg), "Node not in CFG."); if (ic->op != GOTO && ic->op != RETURN && ic->op != JUMPTABLE && ic->next) { wassertl (key_to_index[ic->next->key] < boost::num_vertices(cfg), "Next node not in CFG."); boost::add_edge(key_to_index[ic->key], key_to_index[ic->next->key], cfg); } if (ic->op == GOTO) { wassertl (key_to_index[eBBWithEntryLabel(ebbi, ic->label)->sch->key] < boost::num_vertices(cfg), "GOTO target not in CFG."); boost::add_edge(key_to_index[ic->key], key_to_index[eBBWithEntryLabel(ebbi, ic->label)->sch->key], cfg); } else if (ic->op == RETURN) { wassertl (key_to_index[eBBWithEntryLabel(ebbi, returnLabel)->sch->key] < boost::num_vertices(cfg), "RETURN target not in CFG."); boost::add_edge(key_to_index[ic->key], key_to_index[eBBWithEntryLabel(ebbi, returnLabel)->sch->key], cfg); } else if (ic->op == IFX) { wassertl (key_to_index[eBBWithEntryLabel(ebbi, IC_TRUE(ic) ? IC_TRUE(ic) : IC_FALSE(ic))->sch->key] < boost::num_vertices(cfg), "IFX target not in CFG."); boost::add_edge(key_to_index[ic->key], key_to_index[eBBWithEntryLabel(ebbi, IC_TRUE(ic) ? IC_TRUE(ic) : IC_FALSE(ic))->sch->key], cfg); } else if (ic->op == JUMPTABLE) for (symbol *lbl = (symbol *)(setFirstItem (IC_JTLABELS (ic))); lbl; lbl = (symbol *)(setNextItem (IC_JTLABELS (ic)))) { wassertl (key_to_index[eBBWithEntryLabel(ebbi, lbl)->sch->key] < boost::num_vertices(cfg), "GOTO target not in CFG."); boost::add_edge(key_to_index[ic->key], key_to_index[eBBWithEntryLabel(ebbi, lbl)->sch->key], cfg); } for (int i = 0; i <= operandKey; i++) { if (sym_to_index.find(std::pair(i, 0)) == sym_to_index.end()) continue; if (bitVectBitValue(ic->rlive, i)) { symbol *isym = (symbol *)(hTabItemWithKey(liveRanges, i)); for (reg_t k = 0; k < isym->nRegs; k++) { wassert (key_to_index.find(ic->key) != key_to_index.end()); wassert (sym_to_index.find(std::pair(i, k)) != sym_to_index.end()); wassertl (key_to_index[ic->key] < boost::num_vertices(cfg), "Node not in CFG."); cfg[key_to_index[ic->key]].alive.push_back(sym_to_index[std::pair(i, k)]); } // TODO: Move this to a place where it also works when using the old allocator! isym->block = btree_lowest_common_ancestor(isym->block, ic->block); // If this symbol has a spill location, ensure the spill location is also allocated in a compatible block if (SYM_SPIL_LOC(isym)) SYM_SPIL_LOC(isym)->block = btree_lowest_common_ancestor(SYM_SPIL_LOC(isym)->block, isym->block); } } if (ic->op == IFX) add_operand_to_cfg_node(cfg[key_to_index[ic->key]], IC_COND(ic), sym_to_index); else if (ic->op == JUMPTABLE) add_operand_to_cfg_node(cfg[key_to_index[ic->key]], IC_JTCOND(ic), sym_to_index); else { add_operand_to_cfg_node(cfg[key_to_index[ic->key]], IC_RESULT(ic), sym_to_index); add_operand_to_cfg_node(cfg[key_to_index[ic->key]], IC_LEFT(ic), sym_to_index); add_operand_to_cfg_node(cfg[key_to_index[ic->key]], IC_RIGHT(ic), sym_to_index); } // TODO: Extend live-ranges of returns of built-in function calls back to first SEND. add_operand_conflicts_in_node(cfg[key_to_index[ic->key]], con); } #if 0 // Get conflict graph from sdcc for (var_t i = 0; static_cast::vertices_size_type>(i) < num_vertices(con); i++) { symbol *isym = (symbol *)(hTabItemWithKey(liveRanges, con[i].v)); for (int j = 0; j <= operandKey; j++) if (bitVectBitValue(isym->clashes, j)) { symbol *jsym = (symbol *)(hTabItemWithKey(liveRanges, j)); if (sym_to_index.find(std::pair(j, 0)) == sym_to_index.end()) continue; for (reg_t k = 0; k < jsym->nRegs; k++) boost::add_edge(i, sym_to_index[std::pair(j, k)], con); } } #endif // Check for unconnected live ranges, some might have survived earlier stages. for (var_t i = (var_t)boost::num_vertices(con) - 1; i >= 0; i--) if (!liverange_connected(cfg, i)) { // Non-connected CFGs are created by at least GCSE and lospre. We now have a live-range splitter that fixes them, so this should no longer be necessary, but we leave this code here for now, so in case one gets through, we can still generate correct code. std::cerr << "Warning: Non-connected liverange found and extended to connected component of the CFG:" << con[i].name << ". Please contact sdcc authors with source code to reproduce.\n"; cfg_sym_t cfg2; boost::copy_graph(cfg, cfg2, boost::vertex_copy(forget_properties()).edge_copy(forget_properties())); std::vector::vertices_size_type> component(num_vertices(cfg2)); boost::connected_components(cfg2, &component[0]); for (boost::graph_traits::vertices_size_type j = 0; j < boost::num_vertices(cfg) - 1; j++) { if (std::find(cfg[j].alive.begin(), cfg[j].alive.end(), i) == cfg[j].alive.end()) continue; for (boost::graph_traits::vertices_size_type k = 0; k < boost::num_vertices(cfg) - 1; k++) if (component[j] == component[k] && std::find(cfg[k].alive.begin(), cfg[k].alive.end(), i) == cfg[k].alive.end()) cfg[k].alive.push_back(i); } } // Sort alive and setup dying. for (boost::graph_traits::vertices_size_type i = 0; i < num_vertices(cfg); i++) { std::sort(cfg[i].alive.begin(), cfg[i].alive.end()); cfg[i].dying = cfg_dying_t(cfg[i].alive.begin(), cfg[i].alive.end());; typedef boost::graph_traits::adjacency_iterator adjacency_iter_t; adjacency_iter_t j, j_end; for (boost::tie(j, j_end) = adjacent_vertices(i, cfg); j != j_end; ++j) { cfg_alive_t::const_iterator v, v_end; for (v = cfg[*j].alive.begin(), v_end = cfg[*j].alive.end(); v != v_end; ++v) { const symbol *const vsym = (symbol *)(hTabItemWithKey(liveRanges, con[*v].v)); const operand *const left = IC_LEFT(cfg[*j].ic); const operand *const right = IC_RIGHT(cfg[*j].ic); const operand *const result = IC_RESULT(cfg[*j].ic); if (!(POINTER_SET(cfg[*j].ic) || cfg[*j].ic->op == SET_VALUE_AT_ADDRESS) && (!left || !IS_SYMOP(left) || OP_SYMBOL_CONST(left)->key != vsym->key) && (!right || !IS_SYMOP(right) || OP_SYMBOL_CONST(right)->key != vsym->key) && result && IS_SYMOP(result) && OP_SYMBOL_CONST(result)->key == vsym->key) continue; cfg[i].dying.erase(*v); } } } // Construct conflict graph for (boost::graph_traits::vertices_size_type i = 0; i < num_vertices(cfg); i++) { cfg_alive_t::const_iterator v, v_end; const iCode *ic = cfg[i].ic; for (v = cfg[i].alive.begin(), v_end = cfg[i].alive.end(); v != v_end; ++v) { cfg_alive_t::const_iterator v2, v2_end; // Conflict between operands are handled by add_operand_conflicts_in_node(). if (cfg[i].dying.find (*v) != cfg[i].dying.end()) continue; if (ic->op != IFX && ic->op != JUMPTABLE && IC_RESULT(ic) && IS_SYMOP(IC_RESULT(ic))) { operand_map_t::const_iterator oi, oi_end; for(boost::tie(oi, oi_end) = cfg[i].operands.equal_range(OP_SYMBOL_CONST(IC_RESULT(ic))->key); oi != oi_end; ++oi) if(oi->second == *v) goto next_var; } // Here, v is a variable that survives cfg[i]. // TODO: Check if we can use v, ++v2 instead of cfg[i].alive.begin() to speed things up. for (v2 = cfg[i].alive.begin(), v2_end = cfg[i].alive.end(); v2 != v2_end; ++v2) { if(*v == *v2) continue; if (cfg[i].dying.find (*v2) != cfg[i].dying.end()) continue; boost::add_edge(*v, *v2, con); } next_var: ; } } return(start_ic); } // Computes live ranges for tree decomposition from live ranges from cfg. inline void alive_tree_dec(tree_dec_t &tree_dec, const cfg_t &cfg) { for (unsigned int i = 0; i < num_vertices(tree_dec); i++) { std::set::const_iterator v; for (v = tree_dec[i].bag.begin(); v != tree_dec[i].bag.end(); ++v) tree_dec[i].alive.insert(cfg[*v].alive.begin(), cfg[*v].alive.end()); } } #if defined(DEBUG_RALLOC_DEC) || defined (DEBUG_RALLOC_DEC_ASS) static void print_assignment(const assignment &a) { varset_t::const_iterator i; std::cout << "["; for (i = a.local.begin(); i != a.local.end(); ++i) std::cout << "(" << int(*i) << ", " << int(a.global[*i]) << "), "; std::cout << "c: " << a.s << "]"; } #endif template bool assignment_conflict(const assignment &a, const I_t &I, var_t v, reg_t r) { varset_t::const_iterator i, i_end; for (i = a.local.begin(), i_end = a.local.end(); i != i_end; ++i) { if (a.global[*i] != r) continue; if (boost::edge(*i, v, I).second) return(true); } return(false); } template void assignments_introduce_instruction(assignment_list_t &alist, unsigned short int i, const G_t &G) { assignment_list_t::iterator ai, ai_end; #if !defined(_MSC_VER) // Efficient code - reduces total SDCC runtime by about 5.5% vs. code below, but doesn't work with MSVC++ (at least up to MSVC++ 2015) struct inserter_t { explicit inserter_t(const std::vector& g, i_assignment_t& a) : global(g), ia(a) { } inserter_t& operator=(var_t v) { if (global[v] >= 0) ia.add_var(v, global[v]); return(*this); } inserter_t& operator*() { return(*this); } inserter_t& operator++() { return(*this); } inserter_t& operator++(int i) { i; return(*this); } private: const std::vector& global; i_assignment_t& ia; }; for (ai = alist.begin(), ai_end = alist.end(); ai != ai_end; ++ai) { i_assignment_t ia; std::set_intersection(ai->local.begin(), ai->local.end(), G[i].alive.begin(), G[i].alive.end(), inserter_t(ai->global, ia)); ai->i_assignment = ia; } #else // Human-readable code for (ai = alist.begin(), ai_end = alist.end(); ai != ai_end; ++ai) { varset_t i_variables; std::set_intersection(ai->local.begin(), ai->local.end(), G[i].alive.begin(), G[i].alive.end(), std::inserter(i_variables, i_variables.end())); i_assignment_t ia; varset_t::const_iterator v, v_end; for (v = i_variables.begin(), v_end = i_variables.end(); v != v_end; ++v) if (ai->global[*v] >= 0) ia.add_var(*v, ai->global[*v]); ai->i_assignment = ia; } #endif } template static void assignments_introduce_variable(assignment_list_t &alist, unsigned short int i, short int v, const G_t &G, const I_t &I) { assignment_list_t::iterator ai; bool a_initialized; assignment a; size_t c, c_end; for (ai = alist.begin(), c = 0, c_end = alist.size(); c < c_end; c++, ai++) { a_initialized = false; for (reg_t r = 0; r < port->num_regs; r++) { if (!assignment_conflict(*ai, I, v, r)) { if(!a_initialized) { a = *ai; ai->marked = true; a.marked = false; varset_t::iterator i = std::lower_bound(a.local.begin(), a.local.end(), v); if (i == a.local.end() || *i != v) a.local.insert(i, v); } a.global[v] = r; a.i_assignment.add_var(v, r); if(!assignment_hopeless(a, i, G, I, v)) alist.push_back(a); a.i_assignment.remove_var(v); } } } } struct assignment_rep { assignment_list_t::iterator i; float s; bool operator<(const assignment_rep& a) const { return(s < a.s); } }; template float compability_cost(const assignment& a, const assignment& ac, const I_t &I) { typedef typename boost::graph_traits::adjacency_iterator adjacency_iter_t; float c = 0.0f; varset_t::const_iterator vi, vi_end; for(vi = ac.local.begin(), vi_end = ac.local.end(); vi != vi_end; ++vi) { const var_t v = *vi; if(a.global[v] != ac.global[v]) { c += 1000.0f; continue; } #if 0 // This improves the quality of assignments, but it has a big runtime overhead for some cases. adjacency_iter_t j, j_end; for (boost::tie(j, j_end) = adjacent_vertices(v, I); j != j_end; ++j) if(ac.global[v] != -1 && a.global[*j] == ac.global[v]) { c += 1000.0f; break; } #endif } return(c); } // Ensure that we never get more than options.max_allocs_per_node assignments at a single node of the tree decomposition. // Tries to drop the worst ones first (but never drop the empty assignment, as it's the only one guaranteed to be always valid). template static void drop_worst_assignments(assignment_list_t &alist, unsigned short int i, const G_t &G, const I_t &I, const assignment& ac, bool *const assignment_optimal) { unsigned int n; size_t alist_size; assignment_list_t::iterator ai, an; if ((alist_size = alist.size()) * port->num_regs <= static_cast(options.max_allocs_per_node) || alist_size <= 1) return; *assignment_optimal = false; #ifdef DEBUG_RALLOC_DEC std::cout << "Too many assignments here (" << i << "):" << alist_size << " > " << options.max_allocs_per_node / port->num_regs << ". Dropping some.\n"; std::cout.flush(); #endif #if 0 assignment_rep *arep = new assignment_rep[alist_size]; for (n = 0, ai = alist.begin(); n < alist_size; ++ai, n++) { arep[n].i = ai; arep[n].s = ai->s + rough_cost_estimate(*ai, i, G, I) + compability_cost(*ai, ac, I); } std::nth_element(arep + 1, arep + options.max_allocs_per_node / port->num_regs, arep + alist_size); //std::cout << "nth elem. est. cost: " << arep[options.max_allocs_per_node / port->num_regs].s << "\n"; std::cout.flush(); for (n = options.max_allocs_per_node / port->num_regs + 1; n < alist_size; n++) alist.erase(arep[n].i); #else // More efficient, reduces total SDCC runtime by about 1%. size_t endsize = options.max_allocs_per_node / port->num_regs + 1; size_t arep_maxsize = std::min(alist_size, endsize * 2) + 1; size_t m, k; float bound = std::numeric_limits::infinity(); assignment_rep *arep = new assignment_rep[arep_maxsize]; for(m = 0, n = 1, ai = alist.begin(), ++ai; n < alist_size; n++) { float s = ai->s; if(s > bound) { alist.erase(ai++); continue; } s += compability_cost(*ai, ac, I); if(s > bound) { alist.erase(ai++); continue; } s += rough_cost_estimate(*ai, i, G, I); if(s > bound) { alist.erase(ai++); continue; } if(m >= arep_maxsize - 1) { std::nth_element(arep, arep + (endsize - 1), arep + m); for(k = endsize; k < m; k++) alist.erase(arep[k].i); bound = arep[endsize - 1].s; m = endsize; } arep[m].i = ai; arep[m].s = s; m++; ++ai; } std::nth_element(arep, arep + (endsize - 1), arep + m); for (n = endsize; n < m; n++) alist.erase(arep[n].i); #endif delete[] arep; } // Handle Leaf nodes in the nice tree decomposition template static void tree_dec_ralloc_leaf(T_t &T, typename boost::graph_traits::vertex_descriptor t, const G_t &G, const I_t &I) { #ifdef DEBUG_RALLOC_DEC std::cout << "Leaf (" << t << "):\n"; std::cout.flush(); #endif assignment a; assignment_list_t &alist = T[t].assignments; a.s = 0; a.global.resize(boost::num_vertices(I), -1); alist.push_back(a); #ifdef DEBUG_RALLOC_DEC_ASS assignment_list_t::iterator ai; for(ai = alist.begin(); ai != alist.end(); ++ai) { print_assignment(*ai); std::cout << "\n"; } assignment best; get_best_local_assignment(best, t, T); std::cout << "Best: "; print_assignment(best); std::cout << "\n"; #endif } // Handle introduce nodes in the nice tree decomposition template static void tree_dec_ralloc_introduce(T_t &T, typename boost::graph_traits::vertex_descriptor t, const G_t &G, const I_t &I, const assignment& ac, bool *const assignment_optimal) { typedef typename boost::graph_traits::adjacency_iterator adjacency_iter_t; adjacency_iter_t c, c_end; assignment_list_t::iterator ai; boost::tie(c, c_end) = adjacent_vertices(t, T); #ifdef DEBUG_RALLOC_DEC std::cout << "Introduce (" << t << "):\n"; std::cout.flush(); std::cout << "ac: "; print_assignment(ac); std::cout << "\n"; #endif assignment_list_t &alist = T[t].assignments; std::swap(alist, T[*c].assignments); std::set new_vars; std::set_difference(T[t].alive.begin(), T[t].alive.end(), T[*c].alive.begin(), T[*c].alive.end(), std::inserter(new_vars, new_vars.end())); std::set new_inst; std::set_difference(T[t].bag.begin(), T[t].bag.end(), T[*c].bag.begin(), T[*c].bag.end(), std::inserter(new_inst, new_inst.end())); unsigned short int i = *(new_inst.begin()); // Extend to new instruction. assignments_introduce_instruction(alist, i, G); std::set::const_iterator v; for (v = new_vars.begin(); v != new_vars.end(); ++v) { drop_worst_assignments(alist, i, G, I, ac, assignment_optimal); assignments_introduce_variable(alist, i, *v, G, I); } // Summation of costs and early removal of assignments. for (ai = alist.begin(); ai != alist.end();) { if ((ai->s += (ai->i_costs[i] = instruction_cost(*ai, i, G, I))) == std::numeric_limits::infinity()) ai = alist.erase(ai); else ++ai; } // Free memory in the std::set > that live in the assignments in the list. //boost::singleton_pool::release_memory(); #ifdef DEBUG_RALLOC_DEC_ASS for(ai = alist.begin(); ai != alist.end(); ++ai) { print_assignment(*ai); std::cout << "\n"; } assignment best; get_best_local_assignment(best, t, T); std::cout << "Best: "; print_assignment(best); std::cout << "\n"; #endif } static bool assignments_locally_same(const assignment &a1, const assignment &a2) { if (a1.local != a2.local) return(false); varset_t::const_iterator i, i_end; for (i = a1.local.begin(), i_end = a1.local.end(); i != i_end; ++i) if (a1.global[*i] != a2.global[*i]) return(false); return(true); } // Handle forget nodes in the nice tree decomposition template static void tree_dec_ralloc_forget(T_t &T, typename boost::graph_traits::vertex_descriptor t, const G_t &G, const I_t &I) { typedef typename boost::graph_traits::adjacency_iterator adjacency_iter_t; adjacency_iter_t c, c_end; boost::tie(c, c_end) = adjacent_vertices(t, T); #ifdef DEBUG_RALLOC_DEC std::cout << "Forget (" << t << "):\n"; std::cout.flush(); #endif assignment_list_t &alist = T[t].assignments; std::swap(alist, T[*c].assignments); std::set old_inst; std::set_difference(T[*c].bag.begin(), T[*c].bag.end(), T[t].bag.begin(), T[t].bag.end(), std::inserter(old_inst, old_inst.end())); wassert(old_inst.size() == 1); unsigned short int i = *(old_inst.begin()); varset_t old_vars; std::set_difference(T[*c].alive.begin(), T[*c].alive.end(), T[t].alive.begin(), T[t].alive.end(), std::inserter(old_vars, old_vars.end())); assignment_list_t::iterator ai, aif; // Restrict assignments (locally) to current variables. varset_t newlocal; for (ai = alist.begin(); ai != alist.end(); ++ai) { newlocal.clear(); std::set_difference(ai->local.begin(), ai->local.end(), old_vars.begin(), old_vars.end(), std::inserter(newlocal, newlocal.end())); std::swap(ai->local, newlocal); ai->i_costs.erase(i); } alist.sort(); // Collapse (locally) identical assignments. for (ai = alist.begin(); ai != alist.end();) { aif = ai; for (++ai; ai != alist.end() && assignments_locally_same(*aif, *ai);) { if (aif->s > ai->s) { alist.erase(aif); aif = ai; ++ai; } else ai = alist.erase(ai); } } // Free memory in the std::set > that live in the assignments in the list. //boost::singleton_pool::release_memory(); #ifdef DEBUG_RALLOC_DEC std::cout << "Remaining assignments: " << alist.size() << "\n"; std::cout.flush(); #endif #ifdef DEBUG_RALLOC_DEC_ASS for(ai = alist.begin(); ai != alist.end(); ++ai) { print_assignment(*ai); std::cout << "\n"; } assignment best; get_best_local_assignment(best, t, T); std::cout << "Best: "; print_assignment(best); std::cout << "\n"; #endif } // Handle join nodes in the nice tree decomposition template static void tree_dec_ralloc_join(T_t &T, typename boost::graph_traits::vertex_descriptor t, const G_t &G, const I_t &I) { typedef typename boost::graph_traits::adjacency_iterator adjacency_iter_t; adjacency_iter_t c, c_end, c2, c3; boost::tie(c, c_end) = adjacent_vertices(t, T); #ifdef DEBUG_RALLOC_DEC std::cout << "Join (" << t << "):\n"; std::cout.flush(); #endif c2 = c; ++c; c3 = c; assignment_list_t &alist = T[t].assignments; assignment_list_t &alist2 = T[*c2].assignments; std::swap(alist, T[*c3].assignments); alist.sort(); alist2.sort(); assignment_list_t::iterator ai, ai2; for (ai = alist.begin(), ai2 = alist2.begin(); ai != alist.end() && ai2 != alist2.end();) { if (assignments_locally_same(*ai, *ai2)) { ai->s += ai2->s; // Avoid double-counting instruction costs. std::set::iterator bi; for (bi = T[t].bag.begin(); bi != T[t].bag.end(); ++bi) ai->s -= ai->i_costs[*bi]; for (size_t i = 0; i < ai->global.size(); i++) ai->global[i] = ((ai->global[i] != -1) ? ai->global[i] : ai2->global[i]); ++ai; ++ai2; } else if (*ai < *ai2) ai = alist.erase(ai); else if (*ai2 < *ai) ++ai2; } while(ai != alist.end()) ai = alist.erase(ai); alist2.clear(); #ifdef DEBUG_RALLOC_DEC std::cout << "Remaining assignments: " << alist.size() << "\n"; std::cout.flush(); #endif #ifdef DEBUG_RALLOC_DEC_ASS for(std::list::iterator ai = alist.begin(); ai != alist.end(); ++ai) { print_assignment(*ai); std::cout << "\n"; } #endif } template void get_best_local_assignment(assignment &a, typename boost::graph_traits::vertex_descriptor t, const T_t &T) { const assignment_list_t &alist = T[t].assignments; assignment_list_t::const_iterator ai, ai_end, ai_best; for(ai = ai_best = alist.begin(), ai_end = alist.end(); ai != ai_end; ++ai) if(ai->s < ai_best->s) ai_best = ai; a = *ai_best; } // Handle nodes in the tree decomposition, by detecting their type and calling the appropriate function. Recurses. template static void tree_dec_ralloc_nodes(T_t &T, typename boost::graph_traits::vertex_descriptor t, const G_t &G, const I_t &I, const assignment& ac, bool *const assignment_optimal) { typedef typename boost::graph_traits::adjacency_iterator adjacency_iter_t; adjacency_iter_t c, c_end; typename boost::graph_traits::vertex_descriptor c0, c1; boost::tie(c, c_end) = adjacent_vertices(t, T); switch (out_degree(t, T)) { case 0: tree_dec_ralloc_leaf(T, t, G, I); break; case 1: c0 = *c; tree_dec_ralloc_nodes(T, c0, G, I, ac, assignment_optimal); T[c0].bag.size() < T[t].bag.size() ? tree_dec_ralloc_introduce(T, t, G, I, ac, assignment_optimal) : tree_dec_ralloc_forget(T, t, G, I); break; case 2: c0 = *c++; c1 = *c; if (T[c0].weight < T[c1].weight) // Minimize memory consumption needed for keeping intermediate results. As a side effect, this also helps the ac mechanism in the heuristic. std::swap (c0, c1); tree_dec_ralloc_nodes(T, c0, G, I, ac, assignment_optimal); { assignment *ac2 = new assignment; get_best_local_assignment_biased(*ac2, c0, T); tree_dec_ralloc_nodes(T, c1, G, I, *ac2, assignment_optimal); delete ac2; } tree_dec_ralloc_join(T, t, G, I); break; default: std::cerr << "Not nice.\n"; break; } } // Find the best root selecting from t_old and the leafs under t. template static std::pair::vertex_descriptor, size_t> find_best_root(const T_t &T, typename boost::graph_traits::vertex_descriptor t, size_t t_s, typename boost::graph_traits::vertex_descriptor t_old, size_t t_old_s) { typedef typename boost::graph_traits::adjacency_iterator adjacency_iter_t; adjacency_iter_t c, c_end; typename boost::graph_traits::vertex_descriptor c0, c1, t0; size_t t0_s; boost::tie(c, c_end) = adjacent_vertices(t, T); switch (out_degree(t, T)) { case 0: return(t_s > t_old_s ? std::pair::vertex_descriptor, size_t>(t, t_s) : std::pair::vertex_descriptor, size_t>(t_old, t_old_s)); case 1: return(find_best_root(T, *c, T[*c].alive.size() ? T[*c].alive.size() : t_s, t_old, t_old_s)); case 2: c0 = *c++; c1 = *c; boost::tie(t0, t0_s) = find_best_root(T, c0, T[c0].alive.size() ? T[c0].alive.size() : t_s, t_old, t_old_s); return(find_best_root(T, c1, T[c1].alive.size() ? T[c1].alive.size() : t_s, t0_s > t_old_s ? t0 : t_old, t0_s > t_old_s ? t0_s : t_old_s)); break; default: std::cerr << "Not nice.\n"; break; } return(std::pair::vertex_descriptor, size_t>(t_old, t_old_s)); } // Change the root to t. template static void re_root(T_t &T, typename boost::graph_traits::vertex_descriptor t) { typename boost::graph_traits::vertex_descriptor s0, s1, s2; typename boost::graph_traits::in_edge_iterator e, e_end; boost::tie(e, e_end) = boost::in_edges(t, T); if (e == e_end) return; s0 = t; s1 = boost::source(*e, T); for (boost::tie(e, e_end) = boost::in_edges(s1, T); e != e_end; boost::tie(e, e_end) = boost::in_edges(s1, T)) { s2 = boost::source(*e, T); boost::remove_edge(s1, s0, T); boost::add_edge(s0, s1, T); s0 = s1; s1 = s2; } boost::remove_edge(s1, s0, T); boost::add_edge(s0, s1, T); } // Change the root to improve the assignment removal heuristic. template static void good_re_root(T_t &T) { typename boost::graph_traits::vertex_descriptor t; typedef typename boost::graph_traits::adjacency_iterator adjacency_iter_t; adjacency_iter_t c, c_end; t = find_root(T); for (boost::tie(c, c_end) = boost::adjacent_vertices(t, T); c != c_end && !T[*c].alive.size();) boost::tie(c, c_end) = boost::adjacent_vertices(*c, T); size_t t_s = (c != c_end ? T[*c].alive.size() : 0); t = find_best_root(T, t, t_s, t, t_s).first; if (T[t].alive.size()) { std::cerr << "Error: Invalid root.\n"; return; } re_root(T, t); } // Dump conflict graph, with numbered nodes, show live variables at each node. static void dump_con(const con_t &con) { if (!currFunc) return; std::ofstream dump_file((std::string(dstFileName) + ".dumpralloccon" + currFunc->rname + ".dot").c_str()); std::string *name = new std::string[num_vertices(con)]; for (var_t i = 0; static_cast::vertices_size_type>(i) < boost::num_vertices(con); i++) { std::ostringstream os; os << i; if (con[i].name) os << " : " << con[i].name << ":" << con[i].byte; name[i] = os.str(); } boost::write_graphviz(dump_file, con, boost::make_label_writer(name)); delete[] name; } // Dump cfg, with numbered nodes, show live variables at each node. static void dump_cfg(const cfg_t &cfg) { if (!currFunc) return; std::ofstream dump_file((std::string(dstFileName) + ".dumpralloccfg" + currFunc->rname + ".dot").c_str()); std::string *name = new std::string[num_vertices(cfg)]; for (unsigned int i = 0; i < boost::num_vertices(cfg); i++) { std::ostringstream os; os << i << ", " << cfg[i].ic->key << ": "; cfg_alive_t::const_iterator v; for (v = cfg[i].alive.begin(); v != cfg[i].alive.end(); ++v) os << *v << " "; name[i] = os.str(); } boost::write_graphviz(dump_file, cfg, boost::make_label_writer(name), boost::default_writer(), cfg_titlewriter(currFunc->rname, "register allocator")); delete[] name; } // Dump tree decomposition, show bag and live variables at each node. static void dump_tree_decomposition(const tree_dec_t &tree_dec) { if (!currFunc) return; std::ofstream dump_file((std::string(dstFileName) + ".dumprallocdec" + currFunc->rname + ".dot").c_str()); unsigned int w = 0; std::string *name = new std::string[num_vertices(tree_dec)]; for (unsigned int i = 0; i < boost::num_vertices(tree_dec); i++) { if (tree_dec[i].bag.size() > w) w = tree_dec[i].bag.size(); std::ostringstream os; std::set::const_iterator v1; os << i << " | "; for (v1 = tree_dec[i].bag.begin(); v1 != tree_dec[i].bag.end(); ++v1) os << *v1 << " "; os << ": "; std::set::const_iterator v2; for (v2 = tree_dec[i].alive.begin(); v2 != tree_dec[i].alive.end(); ++v2) os << *v2 << " "; name[i] = os.str(); } boost::write_graphviz(dump_file, tree_dec, boost::make_label_writer(name), boost::default_writer(), dec_titlewriter(w - 1, currFunc->rname, "register allocator")); delete[] name; #ifdef D_RALLOC_DEC std::cout << "Width: " << (w - 1) << "(" << currFunc->name << ")\n"; #endif } #endif