From 3f625689acd570e4f14cc2ebaa43a425d13954ff Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Thu, 31 Mar 2011 13:49:33 +0000 Subject: nv50: copy regalloc fixes from nvc0 Should fix gnome-shell's fade shader. Unification of the shader backend which is supposed to remove the code duplication is still WIP. --- diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index e6f3815..a9a3248 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -228,6 +228,8 @@ struct nv_ref { ubyte flags; /* not used yet */ }; +#define NV_REF_FLAG_REGALLOC_PRIV (1 << 0) + struct nv_basic_block; struct nv_instruction { @@ -263,6 +265,15 @@ struct nv_instruction { ubyte quadop; }; +static INLINE int +nvi_vector_size(struct nv_instruction *nvi) +{ + int i; + assert(nvi); + for (i = 0; i < 4 && nvi->def[i]; ++i); + return i; +} + #define CFG_EDGE_FORWARD 0 #define CFG_EDGE_BACK 1 #define CFG_EDGE_LOOP_ENTER 2 diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c index 39ae366..657df2c 100644 --- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c +++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c @@ -32,14 +32,39 @@ #include "util/u_simple_list.h" #define NUM_REGISTER_FILES 4 +#define MAX_REGISTER_COUNT 256 struct register_set { struct nv_pc *pc; uint32_t last[NUM_REGISTER_FILES]; - uint32_t bits[NUM_REGISTER_FILES][8]; + uint32_t bits[NUM_REGISTER_FILES][(MAX_REGISTER_COUNT + 31) / 32]; }; +/* using OR because a set bit means occupied/unavailable, aliasing is allowed */ +static void +intersect_register_sets(struct register_set *dst, + struct register_set *src1, struct register_set *src2) +{ + int i, j; + + for (i = 0; i < NUM_REGISTER_FILES; ++i) { + for (j = 0; j < (MAX_REGISTER_COUNT + 31) / 32; ++j) + dst->bits[i][j] = src1->bits[i][j] | src2->bits[i][j]; + } +} + +static void +mask_register_set(struct register_set *set, uint32_t mask, uint32_t umask) +{ + int i, j; + + for (i = 0; i < NUM_REGISTER_FILES; ++i) { + for (j = 0; j < (MAX_REGISTER_COUNT + 31) / 32; ++j) + set->bits[i][j] = (set->bits[i][j] | mask) & umask; + } +} + struct nv_pc_pass { struct nv_pc *pc; @@ -61,11 +86,15 @@ ranges_coalesce(struct nv_range *range) } } +/* @return: TRUE if @new_range can be freed (i.e. was not reused) */ static boolean add_range_ex(struct nv_value *val, int bgn, int end, struct nv_range *new_range) { struct nv_range *range, **nextp = &val->livei; + if (bgn == end) /* [a, a) is invalid / empty */ + return TRUE; + for (range = val->livei; range; range = range->next) { if (end < range->bgn) break; /* insert before */ @@ -251,6 +280,8 @@ reg_occupy(struct register_set *set, struct nv_value *val) id <<= s; m = (1 << (1 << s)) - 1; + assert(s >= 0); /* XXX: remove me */ + set->bits[f][id / 32] |= m << (id % 32); if (set->pc->max_reg[f] < id) @@ -286,15 +317,12 @@ join_allowed(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b) if (a->join->reg.id == b->join->reg.id) return TRUE; -#if 1 /* either a or b or both have been assigned */ if (a->join->reg.id >= 0 && b->join->reg.id >= 0) return FALSE; else if (b->join->reg.id >= 0) { - if (a->join->reg.id >= 0) - return FALSE; val = a; a = b; b = val; @@ -309,8 +337,6 @@ join_allowed(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b) return FALSE; } return TRUE; -#endif - return FALSE; } static INLINE void @@ -336,14 +362,14 @@ do_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b) assert(b->join == a->join); } -static INLINE void +static INLINE boolean try_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b) { if (!join_allowed(ctx, a, b)) { #ifdef NV50_RA_DEBUG_JOIN debug_printf("cannot join %i to %i: not allowed\n", b->n, a->n); #endif - return; + return FALSE; } if (livei_have_overlap(a->join, b->join)) { #ifdef NV50_RA_DEBUG_JOIN @@ -351,10 +377,27 @@ try_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b) livei_print(a); livei_print(b); #endif - return; + return FALSE; } do_join_values(ctx, a, b); + + return TRUE; +} + +static void +join_values_nofail(struct nv_pc_pass *ctx, + struct nv_value *a, struct nv_value *b, boolean type_only) +{ + if (type_only) { + assert(join_allowed(ctx, a, b)); + do_join_values(ctx, a, b); + } else { + boolean ok = try_join_values(ctx, a, b); + if (!ok) { + NOUVEAU_ERR("failed to coalesce values\n"); + } + } } static INLINE boolean @@ -369,20 +412,32 @@ need_new_else_block(struct nv_basic_block *b, struct nv_basic_block *p) return (b->num_in > 1) && (n == 2); } +/* Look for the @phi's operand whose definition reaches @b. */ static int phi_opnd_for_bb(struct nv_instruction *phi, struct nv_basic_block *b, struct nv_basic_block *tb) { + struct nv_ref *srci, *srcj; int i, j; - for (j = -1, i = 0; i < 4 && phi->src[i]; ++i) { - if (!nvbb_reachable_by(b, phi->src[i]->value->insn->bb, tb)) + for (j = -1, i = 0; i < 6 && phi->src[i]; ++i) { + srci = phi->src[i]; + /* if already replaced, check with original source first */ + if (srci->flags & NV_REF_FLAG_REGALLOC_PRIV) + srci = srci->value->insn->src[0]; + if (!nvbb_reachable_by(b, srci->value->insn->bb, NULL)) continue; /* NOTE: back-edges are ignored by the reachable-by check */ - if (j < 0 || !nvbb_reachable_by(phi->src[j]->value->insn->bb, - phi->src[i]->value->insn->bb, tb)) + if (j < 0 || !nvbb_reachable_by(srcj->value->insn->bb, + srci->value->insn->bb, NULL)) { j = i; + srcj = srci; + } } + if (j >= 0 && nvbb_reachable_by(b, phi->def[0]->insn->bb, NULL)) + if (!nvbb_reachable_by(srcj->value->insn->bb, + phi->def[0]->insn->bb, NULL)) + j = -1; return j; } @@ -429,16 +484,21 @@ pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b) ctx->pc->current_block = pn; for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) { - if ((j = phi_opnd_for_bb(i, p, b)) < 0) - continue; - val = i->src[j]->value; - - if (i->src[j]->flags) { - val = val->insn->src[0]->value; - while (j < 4 && i->src[j]) - ++j; - assert(j < 4); + j = phi_opnd_for_bb(i, p, b); + + if (j < 0) { + val = i->def[0]; + } else { + val = i->src[j]->value; + if (i->src[j]->flags & NV_REF_FLAG_REGALLOC_PRIV) { + j = -1; + /* use original value, we already encountered & replaced it */ + val = val->insn->src[0]->value; + } } + if (j < 0) /* need an additional source ? */ + for (j = 0; j < 5 && i->src[j] && i->src[j]->value != val; ++j); + assert(j < 5); ni = new_instruction(ctx->pc, NV_OP_MOV); @@ -452,7 +512,7 @@ pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b) nv_reference(ctx->pc, &i->src[j], ni->def[0]); - i->src[j]->flags = 1; + i->src[j]->flags |= NV_REF_FLAG_REGALLOC_PRIV; } if (pn != p && pn->exit) { @@ -470,45 +530,50 @@ pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b) return 0; } +#define JOIN_MASK_PHI (1 << 0) +#define JOIN_MASK_SELECT (1 << 1) +#define JOIN_MASK_MOV (1 << 2) +#define JOIN_MASK_TEX (1 << 3) + static int -pass_join_values(struct nv_pc_pass *ctx, int iter) +pass_join_values(struct nv_pc_pass *ctx, unsigned mask) { int c, n; for (n = 0; n < ctx->num_insns; ++n) { - struct nv_instruction *i = ctx->insns[n]; + struct nv_instruction *nvi, *i = ctx->insns[n]; switch (i->opcode) { case NV_OP_PHI: - if (iter != 2) + if (!(mask & JOIN_MASK_PHI)) break; - for (c = 0; c < 4 && i->src[c]; ++c) - try_join_values(ctx, i->def[0], i->src[c]->value); + for (c = 0; c < 5 && i->src[c]; ++c) + join_values_nofail(ctx, i->def[0], i->src[c]->value, FALSE); break; case NV_OP_MOV: - if ((iter == 2) && i->src[0]->value->insn && - !nv_is_vector_op(i->src[0]->value->join->insn->opcode)) + if (!(mask & JOIN_MASK_MOV)) + break; + nvi = i->src[0]->value->join->insn; + if (nvi && !nv_is_vector_op(nvi->opcode)) try_join_values(ctx, i->def[0], i->src[0]->value); break; case NV_OP_SELECT: - if (iter != 1) + if (!(mask & JOIN_MASK_SELECT)) break; - for (c = 0; c < 4 && i->src[c]; ++c) { - assert(join_allowed(ctx, i->def[0], i->src[c]->value)); - do_join_values(ctx, i->def[0], i->src[c]->value); - } + for (c = 0; c < 5 && i->src[c]; ++c) + join_values_nofail(ctx, i->def[0], i->src[c]->value, TRUE); break; case NV_OP_TEX: case NV_OP_TXB: case NV_OP_TXL: case NV_OP_TXQ: - if (iter) + if (!(mask & JOIN_MASK_TEX)) break; - for (c = 0; c < 4; ++c) { - if (!i->src[c]) - break; - do_join_values(ctx, i->def[c], i->src[c]->value); - } + /* This should work without conflicts because we always generate + * extra MOVs for the sources of a TEX. + */ + for (c = 0; c < 4 && i->src[c]; ++c) + join_values_nofail(ctx, i->def[c], i->src[c]->value, TRUE); break; default: break; @@ -643,15 +708,15 @@ static void collect_live_values(struct nv_basic_block *b, const int n) { int i; - if (b->out[0]) { - if (b->out[1]) { /* what to do about back-edges ? */ + if (b->out[0] && b->out_kind[0] != CFG_EDGE_FAKE) { + if (b->out[1] && b->out_kind[1] != CFG_EDGE_FAKE) { for (i = 0; i < n; ++i) b->live_set[i] = b->out[0]->live_set[i] | b->out[1]->live_set[i]; } else { memcpy(b->live_set, b->out[0]->live_set, n * sizeof(uint32_t)); } } else - if (b->out[1]) { + if (b->out[1] && b->out_kind[1] != CFG_EDGE_FAKE) { memcpy(b->live_set, b->out[1]->live_set, n * sizeof(uint32_t)); } else { memset(b->live_set, 0, n * sizeof(uint32_t)); @@ -770,8 +835,8 @@ insert_ordered_tail(struct nv_value *list, struct nv_value *nval) struct nv_value *elem; for (elem = list->prev; - elem != list && elem->livei->bgn > nval->livei->bgn; - elem = elem->prev); + elem != list && elem->livei->bgn > nval->livei->bgn; + elem = elem->prev); /* now elem begins before or at the same time as val */ nval->prev = elem; @@ -780,44 +845,49 @@ insert_ordered_tail(struct nv_value *list, struct nv_value *nval) elem->next = nval; } -static int -pass_linear_scan(struct nv_pc_pass *ctx, int iter) +static void +collect_register_values(struct nv_pc_pass *ctx, struct nv_value *head, + boolean assigned_only) { - struct nv_instruction *i; - struct register_set f, free; + struct nv_value *val; int k, n; - struct nv_value *cur, *val, *tmp[2]; - struct nv_value active, inactive, handled, unhandled; - make_empty_list(&active); - make_empty_list(&inactive); - make_empty_list(&handled); - make_empty_list(&unhandled); - - nv50_ctor_register_set(ctx->pc, &free); + make_empty_list(head); - /* joined values should have range = NULL and thus not be added; - * also, fixed memory values won't be added because they're not - * def'd, just used - */ for (n = 0; n < ctx->num_insns; ++n) { - i = ctx->insns[n]; + struct nv_instruction *i = ctx->insns[n]; + /* for joined values, only the representative will have livei != NULL */ for (k = 0; k < 4; ++k) { if (i->def[k] && i->def[k]->livei) - insert_ordered_tail(&unhandled, i->def[k]); - else - if (0 && i->def[k]) - debug_printf("skipping def'd value %i: no livei\n", i->def[k]->n); + if (!assigned_only || i->def[k]->reg.id >= 0) + insert_ordered_tail(head, i->def[k]); } if (i->flags_def && i->flags_def->livei) - insert_ordered_tail(&unhandled, i->flags_def); + if (!assigned_only || i->flags_def->reg.id >= 0) + insert_ordered_tail(head, i->flags_def); } - for (val = unhandled.next; val != unhandled.prev; val = val->next) { + for (val = head->next; val != head->prev; val = val->next) { assert(val->join == val); assert(val->livei->bgn <= val->next->livei->bgn); } +} + +static int +pass_linear_scan(struct nv_pc_pass *ctx, int iter) +{ + struct register_set f, free; + struct nv_value *cur, *val, *tmp[2]; + struct nv_value active, inactive, handled, unhandled; + + make_empty_list(&active); + make_empty_list(&inactive); + make_empty_list(&handled); + + nv50_ctor_register_set(ctx->pc, &free); + + collect_register_values(ctx, &unhandled, FALSE); foreach_s(cur, tmp[0], &unhandled) { remove_from_list(cur); @@ -854,13 +924,7 @@ pass_linear_scan(struct nv_pc_pass *ctx, int iter) reg_occupy(&f, val); if (cur->reg.id < 0) { - boolean mem = FALSE; - - if (nv_is_vector_op(cur->insn->opcode)) - mem = !reg_assign(&f, &cur->insn->def[0], 4); - else - if (iter) - mem = !reg_assign(&f, &cur, 1); + boolean mem = !reg_assign(&f, &cur, 1); if (mem) { NOUVEAU_ERR("out of registers\n"); @@ -874,6 +938,67 @@ pass_linear_scan(struct nv_pc_pass *ctx, int iter) return 0; } +/* Allocate values defined by instructions such as TEX, which have to be + * assigned to consecutive registers. + * Linear scan doesn't really work here since the values can have different + * live intervals. + */ +static int +pass_allocate_constrained_values(struct nv_pc_pass *ctx) +{ + struct nv_value regvals, *val; + struct nv_instruction *i; + struct nv_value *defs[4]; + struct register_set regs[4]; + int n, vsize, c; + uint32_t mask; + boolean mem; + + collect_register_values(ctx, ®vals, TRUE); + + for (n = 0; n < ctx->num_insns; ++n) { + i = ctx->insns[n]; + vsize = nvi_vector_size(i); + if (!(vsize > 1)) + continue; + assert(vsize <= 4); + for (c = 0; c < vsize; ++c) + defs[c] = i->def[c]->join; + + if (defs[0]->reg.id >= 0) { + for (c = 1; c < vsize; ++c) + assert(defs[c]->reg.id >= 0); + continue; + } + + for (c = 0; c < vsize; ++c) { + nv50_ctor_register_set(ctx->pc, ®s[c]); + + foreach(val, ®vals) { + if (val->reg.id >= 0 && livei_have_overlap(val, defs[c])) + reg_occupy(®s[c], val); + } + mask = 0x11111111; + if (vsize == 2) /* granularity is 2 and not 4 */ + mask |= 0x11111111 << 2; + mask_register_set(®s[c], 0, mask << c); + + if (defs[c]->livei) + insert_ordered_tail(®vals, defs[c]); + } + for (c = 1; c < vsize; ++c) + intersect_register_sets(®s[0], ®s[0], ®s[c]); + + mem = !reg_assign(®s[0], &defs[0], vsize); + + if (mem) { + NOUVEAU_ERR("out of registers\n"); + abort(); + } + } + return 0; +} + static int nv_pc_pass1(struct nv_pc *pc, struct nv_basic_block *root) { @@ -923,16 +1048,16 @@ nv_pc_pass1(struct nv_pc *pc, struct nv_basic_block *root) livei_print(&pc->values[i]); #endif - ret = pass_join_values(ctx, 0); + ret = pass_join_values(ctx, JOIN_MASK_PHI); if (ret) goto out; - ret = pass_linear_scan(ctx, 0); + ret = pass_join_values(ctx, JOIN_MASK_SELECT | JOIN_MASK_TEX); if (ret) goto out; - ret = pass_join_values(ctx, 1); + ret = pass_join_values(ctx, JOIN_MASK_MOV); if (ret) goto out; - ret = pass_join_values(ctx, 2); + ret = pass_allocate_constrained_values(ctx); if (ret) goto out; ret = pass_linear_scan(ctx, 1); -- cgit v0.8.3-6-g21f6