From caf91f32fd55b52d6448ac1aaf4b746790687779 Mon Sep 17 00:00:00 2001 From: Mike Bauer Date: Tue, 30 Nov 2021 22:28:55 -0800 Subject: [PATCH 01/36] legion: refactor expression reference counting --- runtime/legion/region_tree.cc | 96 ++++++++++++++++++++++++---------- runtime/legion/region_tree.h | 35 ++++++++++--- runtime/legion/region_tree.inl | 24 ++++----- 3 files changed, 109 insertions(+), 46 deletions(-) diff --git a/runtime/legion/region_tree.cc b/runtime/legion/region_tree.cc index 3a5c60bb7c..17fe7a4d3a 100644 --- a/runtime/legion/region_tree.cc +++ b/runtime/legion/region_tree.cc @@ -6427,7 +6427,7 @@ namespace Legion { { const TightenIndexSpaceArgs *targs = (const TightenIndexSpaceArgs*)args; targs->proxy_this->tighten_index_space(); - if (targs->proxy_this->remove_expression_reference(true/*tree only*/)) + if (targs->proxy_this->remove_expression_tree_reference()) delete targs->proxy_this; } @@ -6481,7 +6481,7 @@ namespace Legion { // forest has given us a reference back on it, see if we're the first // ones to write it, if not we can remove the reference now if (!__sync_bool_compare_and_swap(&canonical, NULL, expr)) - expr->remove_expression_reference(true/*tree*/); + expr->remove_expression_tree_reference(); return expr; } @@ -6577,7 +6577,7 @@ namespace Legion { //-------------------------------------------------------------------------- { // We always keep a reference on ourself until we get invalidated - add_expression_reference(true/*expr tree*/); + add_expression_tree_reference(); #ifdef LEGION_GC log_garbage.info("GC Index Expr %lld %d %lld", LEGION_DISTRIBUTED_ID_FILTER(this->did), local_space, expr_id); @@ -6651,27 +6651,48 @@ namespace Legion { } //-------------------------------------------------------------------------- - void IndexSpaceOperation::add_expression_reference(bool expr_tree) + void IndexSpaceOperation::add_expression_reference( + std::set &applied_events, unsigned count) 
//-------------------------------------------------------------------------- { - if (!expr_tree) + WrapperReferenceMutator mutator(applied_events); + add_expression_reference(&mutator, count); + } + + //-------------------------------------------------------------------------- + void IndexSpaceOperation::add_expression_reference( + ReferenceMutator *mutator, unsigned count) + //-------------------------------------------------------------------------- + { + if (mutator == NULL) { - LocalReferenceMutator mutator; - add_base_gc_ref(IS_EXPR_REF, &mutator); + LocalReferenceMutator local_mutator; + add_base_gc_ref(IS_EXPR_REF, &local_mutator, count); } else - add_base_resource_ref(IS_EXPR_REF); + add_base_gc_ref(IS_EXPR_REF, mutator, count); } //-------------------------------------------------------------------------- - bool IndexSpaceOperation::remove_expression_reference(bool expr_tree) + bool IndexSpaceOperation::remove_expression_reference(unsigned count) //-------------------------------------------------------------------------- { - if (expr_tree) - return remove_base_resource_ref(IS_EXPR_REF); - else - return remove_base_gc_ref(IS_EXPR_REF); - } + return remove_base_gc_ref(IS_EXPR_REF, NULL/*mutator*/, count); + } + + //-------------------------------------------------------------------------- + void IndexSpaceOperation::add_expression_tree_reference(unsigned count) + //-------------------------------------------------------------------------- + { + add_base_resource_ref(IS_EXPR_REF, count); + } + + //-------------------------------------------------------------------------- + bool IndexSpaceOperation::remove_expression_tree_reference(unsigned count) + //-------------------------------------------------------------------------- + { + return remove_base_resource_ref(IS_EXPR_REF, count); + } //-------------------------------------------------------------------------- void IndexSpaceOperation::invalidate_operation( @@ -7461,7 +7482,7 @@ namespace Legion { 
parent_operations.begin(); it != parent_operations.end(); it++, idx++) { - (*it)->add_expression_reference(true/*expr tree*/); + (*it)->add_expression_tree_reference(); parents[idx] = (*it); } } @@ -7472,7 +7493,7 @@ namespace Legion { // Remove any references that we have on the parents for (std::vector::const_iterator it = parents.begin(); it != parents.end(); it++) - if ((*it)->remove_expression_reference(true/*expr tree*/)) + if ((*it)->remove_expression_tree_reference()) delete (*it); } } @@ -8648,33 +8669,54 @@ namespace Legion { } //-------------------------------------------------------------------------- - void IndexSpaceNode::add_expression_reference(bool expr_tree) + void IndexSpaceNode::add_expression_reference( + std::set &applied_events, unsigned count) //-------------------------------------------------------------------------- { - if (!expr_tree) + WrapperReferenceMutator mutator(applied_events); + add_expression_reference(&mutator, count); + } + + //-------------------------------------------------------------------------- + void IndexSpaceNode::add_expression_reference( + ReferenceMutator *mutator, unsigned count) + //-------------------------------------------------------------------------- + { + if (mutator == NULL) { - LocalReferenceMutator mutator; - add_base_valid_ref(IS_EXPR_REF, &mutator); + LocalReferenceMutator local_mutator; + add_base_gc_ref(IS_EXPR_REF, &local_mutator, count); } else - add_base_resource_ref(IS_EXPR_REF); + add_base_gc_ref(IS_EXPR_REF, mutator, count); } //-------------------------------------------------------------------------- - bool IndexSpaceNode::remove_expression_reference(bool expr_tree) + bool IndexSpaceNode::remove_expression_reference(unsigned count) //-------------------------------------------------------------------------- { - if (expr_tree) - return remove_base_resource_ref(IS_EXPR_REF); - else - return remove_base_valid_ref(IS_EXPR_REF); + return remove_base_gc_ref(IS_EXPR_REF, NULL/*mutator*/, count); + 
} + + //-------------------------------------------------------------------------- + void IndexSpaceNode::add_expression_tree_reference(unsigned count) + //-------------------------------------------------------------------------- + { + add_base_resource_ref(IS_EXPR_REF, count); + } + + //-------------------------------------------------------------------------- + bool IndexSpaceNode::remove_expression_tree_reference(unsigned count) + //-------------------------------------------------------------------------- + { + return remove_base_resource_ref(IS_EXPR_REF, count); } //-------------------------------------------------------------------------- bool IndexSpaceNode::remove_operation(RegionTreeForest *forest) //-------------------------------------------------------------------------- { - return remove_expression_reference(true/*expr tree*/); + return remove_expression_tree_reference(); } //-------------------------------------------------------------------------- diff --git a/runtime/legion/region_tree.h b/runtime/legion/region_tree.h index 74c971a145..2abce33a6f 100644 --- a/runtime/legion/region_tree.h +++ b/runtime/legion/region_tree.h @@ -1028,7 +1028,7 @@ namespace Legion { TightenIndexSpaceArgs(IndexSpaceExpression *proxy) : LgTaskArgs(implicit_provenance), proxy_this(proxy) - { proxy->add_expression_reference(true/*tree only*/); } + { proxy->add_expression_tree_reference(); } public: IndexSpaceExpression *const proxy_this; }; @@ -1091,10 +1091,17 @@ namespace Legion { virtual void pack_expression(Serializer &rez, AddressSpaceID target) = 0; virtual void pack_expression_value(Serializer &rez, AddressSpaceID target) = 0; + public: virtual bool try_add_canonical_reference(void) = 0; virtual bool remove_canonical_reference(void) = 0; - virtual void add_expression_reference(bool expr_tree = false) = 0; - virtual bool remove_expression_reference(bool expr_tree = false) = 0; + virtual void add_expression_reference(std::set &applied_events, + unsigned count = 1) = 
0; + virtual void add_expression_reference(ReferenceMutator *mutator = NULL, + unsigned count = 1) = 0; + virtual bool remove_expression_reference(unsigned count = 1) = 0; + virtual void add_expression_tree_reference(unsigned count = 1) = 0; + virtual bool remove_expression_tree_reference(unsigned count = 1) = 0; + public: virtual bool remove_operation(RegionTreeForest *forest) = 0; virtual IndexSpaceNode* create_node(IndexSpace handle, DistributedID did, RtEvent initialized, @@ -1338,10 +1345,17 @@ namespace Legion { virtual void pack_expression(Serializer &rez, AddressSpaceID target) = 0; virtual void pack_expression_value(Serializer &rez, AddressSpaceID target) = 0; + public: virtual bool try_add_canonical_reference(void); virtual bool remove_canonical_reference(void); - virtual void add_expression_reference(bool expr_tree = false); - virtual bool remove_expression_reference(bool expr_tree = false); + virtual void add_expression_reference(std::set &applied_events, + unsigned count = 1); + virtual void add_expression_reference(ReferenceMutator *mutator = NULL, + unsigned count = 1); + virtual bool remove_expression_reference(unsigned count = 1); + virtual void add_expression_tree_reference(unsigned count = 1); + virtual bool remove_expression_tree_reference(unsigned count = 1); + public: virtual bool remove_operation(RegionTreeForest *forest) = 0; virtual IndexSpaceNode* create_node(IndexSpace handle, DistributedID did, RtEvent initialized, @@ -1946,10 +1960,17 @@ namespace Legion { virtual bool check_empty(void) = 0; virtual void pack_expression(Serializer &rez, AddressSpaceID target); virtual void pack_expression_value(Serializer &rez,AddressSpaceID target); + public: virtual bool try_add_canonical_reference(void); virtual bool remove_canonical_reference(void); - virtual void add_expression_reference(bool expr_tree = false); - virtual bool remove_expression_reference(bool expr_tree = false); + virtual void add_expression_reference(std::set &applied_events, + 
unsigned count = 1); + virtual void add_expression_reference(ReferenceMutator *mutator = NULL, + unsigned count = 1); + virtual bool remove_expression_reference(unsigned count = 1); + virtual void add_expression_tree_reference(unsigned count = 1); + virtual bool remove_expression_tree_reference(unsigned count = 1); + public: virtual bool remove_operation(RegionTreeForest *forest); virtual IndexSpaceNode* create_node(IndexSpace handle, DistributedID did, RtEvent initialized, diff --git a/runtime/legion/region_tree.inl b/runtime/legion/region_tree.inl index 380b3145bb..3e3e3e2838 100644 --- a/runtime/legion/region_tree.inl +++ b/runtime/legion/region_tree.inl @@ -1876,7 +1876,7 @@ namespace Legion { #endif // Add the parent and the reference sub->add_parent_operation(this); - sub->add_expression_reference(true/*expr tree*/); + sub->add_expression_tree_reference(); // Then get the realm index space expression ApEvent precondition = sub->get_expr_index_space( &spaces[idx], this->type_tag, false/*need tight result*/); @@ -1946,7 +1946,7 @@ namespace Legion { { // Remove references from our sub expressions for (unsigned idx = 0; idx < sub_expressions.size(); idx++) - if (sub_expressions[idx]->remove_expression_reference(true/*exprtree*/)) + if (sub_expressions[idx]->remove_expression_tree_reference()) delete sub_expressions[idx]; } @@ -1997,7 +1997,7 @@ namespace Legion { forest->remove_union_operation(this, sub_expressions); // Remove our expression reference added by invalidate_operation // and return true if we should be deleted - return this->remove_expression_reference(true/*expr tree*/); + return this->remove_expression_tree_reference(); } //-------------------------------------------------------------------------- @@ -2019,7 +2019,7 @@ namespace Legion { #endif // Add the parent and the reference sub->add_parent_operation(this); - sub->add_expression_reference(true/*expr tree*/); + sub->add_expression_tree_reference(); ApEvent precondition = 
sub->get_expr_index_space( &spaces[idx], this->type_tag, false/*need tight result*/); if (precondition.exists()) @@ -2089,7 +2089,7 @@ namespace Legion { { // Remove references from our sub expressions for (unsigned idx = 0; idx < sub_expressions.size(); idx++) - if (sub_expressions[idx]->remove_expression_reference(true/*exprtree*/)) + if (sub_expressions[idx]->remove_expression_tree_reference()) delete sub_expressions[idx]; } @@ -2141,7 +2141,7 @@ namespace Legion { forest->remove_intersection_operation(this, sub_expressions); // Remove our expression reference added by invalidate_operation // and return true if we should be deleted - return this->remove_expression_reference(true/*expr tree*/); + return this->remove_expression_tree_reference(); } //-------------------------------------------------------------------------- @@ -2160,7 +2160,7 @@ namespace Legion { { // Special case for when the expressions are the same lhs->add_parent_operation(this); - lhs->add_expression_reference(true/*expr tree*/); + lhs->add_expression_tree_reference(); this->realm_index_space = Realm::IndexSpace::make_empty(); this->tight_index_space = Realm::IndexSpace::make_empty(); this->realm_index_space_ready = ApEvent::NO_AP_EVENT; @@ -2172,8 +2172,8 @@ namespace Legion { // Add the parent and the references lhs->add_parent_operation(this); rhs->add_parent_operation(this); - lhs->add_expression_reference(true/*expr tree*/); - rhs->add_expression_reference(true/*expr tree*/); + lhs->add_expression_tree_reference(); + rhs->add_expression_tree_reference(); ApEvent left_ready = lhs->get_expr_index_space(&lhs_space, this->type_tag, false/*tight*/); ApEvent right_ready = @@ -2239,9 +2239,9 @@ namespace Legion { //-------------------------------------------------------------------------- { if ((rhs != NULL) && (lhs != rhs) && - rhs->remove_expression_reference(true/*expr tree*/)) + rhs->remove_expression_tree_reference()) delete rhs; - if ((lhs != NULL) && 
lhs->remove_expression_reference(true/*expr tree*/)) + if ((lhs != NULL) && lhs->remove_expression_tree_reference()) delete lhs; } @@ -2294,7 +2294,7 @@ namespace Legion { forest->remove_subtraction_operation(this, lhs, rhs); // Remove our expression reference added by invalidate_operation // and return true if we should be deleted - return this->remove_expression_reference(true/*expr tree*/); + return this->remove_expression_tree_reference(); } ///////////////////////////////////////////////////////////// From ddbd82bcc23c684fda5c0fde86a1a738de56bc59 Mon Sep 17 00:00:00 2001 From: Mike Bauer Date: Wed, 1 Dec 2021 02:45:50 -0800 Subject: [PATCH 02/36] legion: encode distributed ID kind for index space expressions --- runtime/legion/region_tree.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime/legion/region_tree.cc b/runtime/legion/region_tree.cc index 17fe7a4d3a..eaef5703bd 100644 --- a/runtime/legion/region_tree.cc +++ b/runtime/legion/region_tree.cc @@ -6570,8 +6570,8 @@ namespace Legion { IndexSpaceOperation::IndexSpaceOperation(TypeTag tag, OperationKind kind, RegionTreeForest *ctx) : IndexSpaceExpression(tag, ctx->runtime, inter_lock), - DistributedCollectable(ctx->runtime, - ctx->runtime->get_available_distributed_id(), + DistributedCollectable(ctx->runtime, LEGION_DISTRIBUTED_HELP_ENCODE( + ctx->runtime->get_available_distributed_id(), INDEX_EXPR_NODE_DC), ctx->runtime->address_space), context(ctx), origin_expr(this), op_kind(kind), invalidated(0) //-------------------------------------------------------------------------- From b2bb4f139a7cf06a12a63f96e7a0bfb611aad735 Mon Sep 17 00:00:00 2001 From: Mike Bauer Date: Wed, 1 Dec 2021 15:40:00 -0800 Subject: [PATCH 03/36] legion: more work on refactoring distributed expressions and tracking references --- runtime/legion/legion_analysis.cc | 44 ++++++----- runtime/legion/legion_analysis.h | 6 +- runtime/legion/legion_instances.cc | 18 +++-- runtime/legion/legion_trace.cc | 16 ++-- 
runtime/legion/legion_views.cc | 15 +++- runtime/legion/legion_views.h | 3 + runtime/legion/region_tree.cc | 119 ++++++++++++++++++++--------- runtime/legion/region_tree.h | 65 ++++++++++------ runtime/legion/region_tree.inl | 24 +++--- 9 files changed, 199 insertions(+), 111 deletions(-) diff --git a/runtime/legion/legion_analysis.cc b/runtime/legion/legion_analysis.cc index e77f2401f5..7d2e0953ce 100644 --- a/runtime/legion/legion_analysis.cc +++ b/runtime/legion/legion_analysis.cc @@ -80,7 +80,7 @@ namespace Legion { #ifdef DEBUG_LEGION assert(expr != NULL); #endif - expr->add_expression_reference(); + expr->add_base_expression_reference(IS_EXPR_REF); } #else //-------------------------------------------------------------------------- @@ -92,7 +92,7 @@ namespace Legion { #ifdef DEBUG_LEGION assert(expr != NULL); #endif - expr->add_expression_reference(); + expr->add_base_expression_reference(IS_EXPR_REF); } #endif @@ -116,7 +116,7 @@ namespace Legion { #ifdef DEBUG_LEGION assert(expr != NULL); #endif - if (expr->remove_expression_reference()) + if (expr->remove_base_expression_reference(IS_EXPR_REF)) delete expr; } @@ -8666,7 +8666,7 @@ namespace Legion { pending_analyses(0) //-------------------------------------------------------------------------- { - set_expr->add_expression_reference(); + set_expr->add_nested_expression_reference(did); if (index_space_node != NULL) { #ifdef DEBUG_LEGION @@ -8702,7 +8702,7 @@ namespace Legion { EquivalenceSet::~EquivalenceSet(void) //-------------------------------------------------------------------------- { - if (set_expr->remove_expression_reference()) + if (set_expr->remove_nested_expression_reference(did)) delete set_expr; if ((index_space_node != NULL) && index_space_node->remove_nested_resource_ref(did)) @@ -8754,7 +8754,7 @@ namespace Legion { for (FieldMaskSet::const_iterator it = unrefined_remainders.begin(); it != unrefined_remainders.end(); it++) - if (it->first->remove_expression_reference()) + if 
(it->first->remove_nested_expression_reference(did)) delete it->first; } if (subset_exprs != NULL) @@ -9028,7 +9028,7 @@ namespace Legion { to_delete.begin(); it != to_delete.end(); it++) { unrefined_remainders.erase(*it); - if ((*it)->remove_expression_reference()) + if ((*it)->remove_nested_expression_reference(did)) delete (*it); } unrefined_remainders.tighten_valid_mask(); @@ -9266,7 +9266,7 @@ namespace Legion { assert(unrefined_remainders.get_valid_mask() * overlap); #endif if (unrefined_remainders.insert(diff_expr, overlap)) - diff_expr->add_expression_reference(); + diff_expr->add_nested_expression_reference(did); } } // Remove these fields from the overlap indicating @@ -9455,7 +9455,7 @@ namespace Legion { it->set_mask); #endif if (unrefined_remainders.insert(remainder, it->set_mask)) - remainder->add_expression_reference(); + remainder->add_nested_expression_reference(did); } } } @@ -9499,7 +9499,7 @@ namespace Legion { to_filter); #endif if (unrefined_remainders.insert(remainder, to_filter)) - remainder->add_expression_reference(); + remainder->add_nested_expression_reference(did); } } } @@ -9596,7 +9596,7 @@ namespace Legion { forest->subtract_index_spaces(set_expr, expr); #endif if (unrefined_remainders.insert(diff, ray_mask)) - diff->add_expression_reference(); + diff->add_nested_expression_reference(did); ray_mask.clear(); } } @@ -10245,7 +10245,7 @@ namespace Legion { unrefined_remainders.clear(); // Defer removing the references on these expressions until // the migration has been done - DeferRemoveRefArgs args(references); + DeferRemoveRefArgs args(references, did); runtime->issue_runtime_meta_task(args, LG_THROUGHPUT_WORK_PRIORITY, done_migration); } @@ -10426,7 +10426,7 @@ namespace Legion { FieldMask mask; derez.deserialize(mask); if (unrefined_remainders.insert(expr, mask)) - expr->add_expression_reference(); + expr->add_nested_expression_reference(did); } size_t num_disjoint_refinements; derez.deserialize(num_disjoint_refinements); @@ 
-13395,7 +13395,7 @@ namespace Legion { assert(unrefined_remainders.get_valid_mask() * finalize_mask); #endif if (unrefined_remainders.insert(diff_expr, finalize_mask)) - diff_expr->add_expression_reference(); + diff_expr->add_nested_expression_reference(did); } } @@ -13438,7 +13438,7 @@ namespace Legion { it = to_delete.begin(); it != to_delete.end(); it++) { unrefined_remainders.erase(*it); - if ((*it)->remove_expression_reference()) + if ((*it)->remove_nested_expression_reference(did)) delete (*it); } unrefined_remainders.tighten_valid_mask(); @@ -13448,7 +13448,7 @@ namespace Legion { for (FieldMaskSet::const_iterator it = to_add.begin(); it != to_add.end(); it++) if (unrefined_remainders.insert(it->first, it->second)) - it->first->add_expression_reference(); + it->first->add_nested_expression_reference(did); } } @@ -13865,7 +13865,7 @@ namespace Legion { //-------------------------------------------------------------------------- { if (local) - expr->add_expression_reference(); + expr->add_base_expression_reference(IS_EXPR_REF); } //-------------------------------------------------------------------------- @@ -13885,7 +13885,8 @@ namespace Legion { // Clean up our ray mask delete dargs->ray_mask; // Remove our expression reference too - if (dargs->is_local && dargs->expr->remove_expression_reference()) + if (dargs->is_local && + dargs->expr->remove_base_expression_reference(IS_EXPR_REF)) delete dargs->expr; } @@ -13982,7 +13983,7 @@ namespace Legion { //-------------------------------------------------------------------------- { if (is_local) - expr->add_expression_reference(); + expr->add_base_expression_reference(IS_EXPR_REF); } //-------------------------------------------------------------------------- @@ -14072,7 +14073,8 @@ namespace Legion { // Once construction is complete then we do the registration set->register_with_runtime(NULL/*no remote registration needed*/); // Remove our expression reference too - if (dargs->is_local && 
dargs->expr->remove_expression_reference()) + if (dargs->is_local && + dargs->expr->remove_base_expression_reference(IS_EXPR_REF)) delete dargs->expr; } @@ -14083,7 +14085,7 @@ namespace Legion { const DeferRemoveRefArgs *dargs = (const DeferRemoveRefArgs*)args; for (std::vector::const_iterator it = dargs->references->begin(); it != dargs->references->end(); it++) - if ((*it)->remove_expression_reference()) + if ((*it)->remove_nested_expression_reference(dargs->source)) delete (*it); delete dargs->references; } diff --git a/runtime/legion/legion_analysis.h b/runtime/legion/legion_analysis.h index 8bdf470e1d..01681648bc 100644 --- a/runtime/legion/legion_analysis.h +++ b/runtime/legion/legion_analysis.h @@ -2231,11 +2231,13 @@ namespace Legion { public: static const LgTaskID TASK_ID = LG_DEFER_REMOVE_EQ_REF_TASK_ID; public: - DeferRemoveRefArgs(std::vector *refs) + DeferRemoveRefArgs(std::vector *refs, + DistributedID src) : LgTaskArgs(implicit_provenance), - references(refs) { } + references(refs), source(src) { } public: std::vector *const references; + const DistributedID source; }; protected: enum EqState { diff --git a/runtime/legion/legion_instances.cc b/runtime/legion/legion_instances.cc index 525c7b7a17..cbf204d642 100644 --- a/runtime/legion/legion_instances.cc +++ b/runtime/legion/legion_instances.cc @@ -548,9 +548,9 @@ namespace Legion { if (layout != NULL) layout->add_reference(); if (field_space_node != NULL) - field_space_node->add_base_gc_ref(PHYSICAL_MANAGER_REF); + field_space_node->add_nested_gc_ref(did); if (instance_domain != NULL) - instance_domain->add_expression_reference(); + instance_domain->add_nested_expression_reference(did); } //-------------------------------------------------------------------------- @@ -560,10 +560,10 @@ namespace Legion { if ((layout != NULL) && layout->remove_reference()) delete layout; if ((field_space_node != NULL) && - field_space_node->remove_base_gc_ref(PHYSICAL_MANAGER_REF)) + 
field_space_node->remove_nested_gc_ref(did)) delete field_space_node; if ((instance_domain != NULL) && - instance_domain->remove_expression_reference()) + instance_domain->remove_nested_expression_reference(did)) delete instance_domain; } @@ -1592,7 +1592,7 @@ namespace Legion { //-------------------------------------------------------------------------- { if (local_is) - local_expr->add_expression_reference(); + local_expr->add_base_expression_reference(IS_EXPR_REF); } //-------------------------------------------------------------------------- @@ -1613,7 +1613,8 @@ namespace Legion { dargs->piece_list_size, space_node, dargs->tree_id, constraints, dargs->use_event, dargs->redop, dargs->shadow_instance); // Remove the local expression reference if necessary - if (dargs->local_is && dargs->local_expr->remove_expression_reference()) + if (dargs->local_is && + dargs->local_expr->remove_base_expression_reference(IS_EXPR_REF)) delete dargs->local_expr; } @@ -3021,7 +3022,7 @@ namespace Legion { //-------------------------------------------------------------------------- { if (local_is) - local_expr->add_expression_reference(); + local_expr->add_base_expression_reference(IS_EXPR_REF); } //-------------------------------------------------------------------------- @@ -3044,7 +3045,8 @@ namespace Legion { dargs->piece_list_size, space_node, dargs->tree_id, constraints, dargs->use_event, dargs->redop); // Remove the local expression reference if necessary - if (dargs->local_is && dargs->local_expr->remove_expression_reference()) + if (dargs->local_is && + dargs->local_expr->remove_base_expression_reference(IS_EXPR_REF)) delete dargs->local_expr; } diff --git a/runtime/legion/legion_trace.cc b/runtime/legion/legion_trace.cc index 0d37521738..ce3cc507b3 100644 --- a/runtime/legion/legion_trace.cc +++ b/runtime/legion/legion_trace.cc @@ -5548,14 +5548,14 @@ namespace Legion { assert(precondition_idx < tpl.events.size()); assert(expr != NULL); #endif - 
expr->add_expression_reference(); + expr->add_base_expression_reference(TRACE_REF); } //-------------------------------------------------------------------------- IssueCopy::~IssueCopy(void) //-------------------------------------------------------------------------- { - if (expr->remove_expression_reference()) + if (expr->remove_base_expression_reference(TRACE_REF)) delete expr; } @@ -5646,7 +5646,7 @@ namespace Legion { assert(precondition_idx < tpl.events.size()); assert(expr != NULL); #endif - expr->add_expression_reference(); + expr->add_base_expression_reference(TRACE_REF); indirections.resize(indirects.size()); for (unsigned idx = 0; idx < indirects.size(); idx++) indirections[idx] = indirects[idx]->clone(); @@ -5656,7 +5656,7 @@ namespace Legion { IssueIndirect::~IssueIndirect(void) //-------------------------------------------------------------------------- { - if (expr->remove_expression_reference()) + if (expr->remove_base_expression_reference(TRACE_REF)) delete expr; for (unsigned idx = 0; idx < indirections.size(); idx++) delete indirections[idx]; @@ -5747,7 +5747,7 @@ namespace Legion { assert(precondition_idx < tpl.events.size()); assert(expr != NULL); #endif - expr->add_expression_reference(); + expr->add_base_expression_reference(TRACE_REF); src->add_base_resource_ref(TRACE_REF); dst->add_base_resource_ref(TRACE_REF); } @@ -5756,7 +5756,7 @@ namespace Legion { GPUReduction::~GPUReduction(void) //-------------------------------------------------------------------------- { - if (expr->remove_expression_reference()) + if (expr->remove_base_expression_reference(TRACE_REF)) delete expr; if (src->remove_base_resource_ref(TRACE_REF)) delete src; @@ -5846,7 +5846,7 @@ namespace Legion { assert(fields.size() > 0); assert(precondition_idx < tpl.events.size()); #endif - expr->add_expression_reference(); + expr->add_base_expression_reference(TRACE_REF); fill_value = malloc(fill_size); memcpy(fill_value, value, fill_size); } @@ -5855,7 +5855,7 @@ namespace 
Legion { IssueFill::~IssueFill(void) //-------------------------------------------------------------------------- { - if (expr->remove_expression_reference()) + if (expr->remove_base_expression_reference(TRACE_REF)) delete expr; free(fill_value); } diff --git a/runtime/legion/legion_views.cc b/runtime/legion/legion_views.cc index 13daf44135..dcb15c3493 100644 --- a/runtime/legion/legion_views.cc +++ b/runtime/legion/legion_views.cc @@ -749,16 +749,19 @@ namespace Legion { InstanceView *view, IndexSpaceExpression *exp) : context(ctx), manager(man), inst_view(view), view_expr(exp), view_volume(view_expr->get_volume()), +#if defined(DEBUG_LEGION_GC) || defined(LEGION_GC) + view_did(view->did), +#endif invalid_fields(FieldMask(LEGION_FIELD_MASK_FIELD_ALL_ONES)) //-------------------------------------------------------------------------- { - view_expr->add_expression_reference(); + view_expr->add_nested_expression_reference(view->did); } //-------------------------------------------------------------------------- ExprView::ExprView(const ExprView &rhs) : context(rhs.context), manager(rhs.manager), inst_view(rhs.inst_view), - view_expr(rhs.view_expr), view_volume(rhs.view_volume) + view_did(0), view_expr(rhs.view_expr), view_volume(rhs.view_volume) //-------------------------------------------------------------------------- { // should never be called @@ -769,8 +772,14 @@ namespace Legion { ExprView::~ExprView(void) //-------------------------------------------------------------------------- { - if (view_expr->remove_expression_reference()) +#if defined(DEBUG_LEGION_GC) || defined(LEGION_GC) + if (view_expr->remove_nested_expression_reference(view_did)) + delete view_expr; +#else + // We can lie about the did here since its not actually used + if (view_expr->remove_nested_expression_reference(0/*bogus did*/)) delete view_expr; +#endif if (!subviews.empty()) { for (FieldMaskSet::iterator it = subviews.begin(); diff --git a/runtime/legion/legion_views.h 
b/runtime/legion/legion_views.h index 6186c35701..d8c6e5fb35 100644 --- a/runtime/legion/legion_views.h +++ b/runtime/legion/legion_views.h @@ -443,6 +443,9 @@ namespace Legion { InstanceView *const inst_view; IndexSpaceExpression *const view_expr; const size_t view_volume; +#if defined(DEBUG_LEGION_GC) || defined(LEGION_GC) + const DistributedID view_did; +#endif // This is publicly mutable and protected by expr_lock from // the owner inst_view FieldMask invalid_fields; diff --git a/runtime/legion/region_tree.cc b/runtime/legion/region_tree.cc index eaef5703bd..999de75122 100644 --- a/runtime/legion/region_tree.cc +++ b/runtime/legion/region_tree.cc @@ -6342,7 +6342,6 @@ namespace Legion { IndexSpaceExprID remote_expr_id; derez.deserialize(remote_expr_id); IndexSpaceExpression *result = unpack_expression_value(derez, source); - result->add_expression_reference(); { AutoLock l_lock(lookup_is_op_lock); #ifdef DEBUG_LEGION @@ -6427,7 +6426,7 @@ namespace Legion { { const TightenIndexSpaceArgs *targs = (const TightenIndexSpaceArgs*)args; targs->proxy_this->tighten_index_space(); - if (targs->proxy_this->remove_expression_tree_reference()) + if (targs->proxy_this->remove_base_expression_reference(IS_EXPR_REF)) delete targs->proxy_this; } @@ -6481,7 +6480,7 @@ namespace Legion { // forest has given us a reference back on it, see if we're the first // ones to write it, if not we can remove the reference now if (!__sync_bool_compare_and_swap(&canonical, NULL, expr)) - expr->remove_expression_tree_reference(); + expr->remove_canonical_reference(); return expr; } @@ -6577,7 +6576,7 @@ namespace Legion { //-------------------------------------------------------------------------- { // We always keep a reference on ourself until we get invalidated - add_expression_tree_reference(); + add_base_resource_ref(IS_EXPR_REF); #ifdef LEGION_GC log_garbage.info("GC Index Expr %lld %d %lld", LEGION_DISTRIBUTED_ID_FILTER(this->did), local_space, expr_id); @@ -6651,47 +6650,72 @@ 
namespace Legion { } //-------------------------------------------------------------------------- - void IndexSpaceOperation::add_expression_reference( - std::set &applied_events, unsigned count) + void IndexSpaceOperation::add_base_expression_reference( + ReferenceSource source, ReferenceMutator *mutator, unsigned count) + //-------------------------------------------------------------------------- + { + if (mutator == NULL) + { + LocalReferenceMutator local_mutator; + add_base_gc_ref(source, &local_mutator, count); + } + else + add_base_gc_ref(source, mutator, count); + } + + //-------------------------------------------------------------------------- + void IndexSpaceOperation::add_nested_expression_reference( + DistributedID source, std::set &applied_events, unsigned count) //-------------------------------------------------------------------------- { WrapperReferenceMutator mutator(applied_events); - add_expression_reference(&mutator, count); + add_nested_expression_reference(source, &mutator, count); } //-------------------------------------------------------------------------- - void IndexSpaceOperation::add_expression_reference( - ReferenceMutator *mutator, unsigned count) + void IndexSpaceOperation::add_nested_expression_reference( + DistributedID source, ReferenceMutator *mutator, unsigned count) //-------------------------------------------------------------------------- { if (mutator == NULL) { LocalReferenceMutator local_mutator; - add_base_gc_ref(IS_EXPR_REF, &local_mutator, count); + add_nested_gc_ref(source, &local_mutator, count); } else - add_base_gc_ref(IS_EXPR_REF, mutator, count); + add_nested_gc_ref(source, mutator, count); + } + + //-------------------------------------------------------------------------- + bool IndexSpaceOperation::remove_base_expression_reference( + ReferenceSource source, unsigned count) + //-------------------------------------------------------------------------- + { + return remove_base_gc_ref(source, NULL/*mutator*/, 
count); } //-------------------------------------------------------------------------- - bool IndexSpaceOperation::remove_expression_reference(unsigned count) + bool IndexSpaceOperation::remove_nested_expression_reference( + DistributedID source, unsigned count) //-------------------------------------------------------------------------- { - return remove_base_gc_ref(IS_EXPR_REF, NULL/*mutator*/, count); + return remove_nested_gc_ref(source, NULL/*mutator*/, count); } //-------------------------------------------------------------------------- - void IndexSpaceOperation::add_expression_tree_reference(unsigned count) + void IndexSpaceOperation::add_tree_expression_reference(DistributedID id, + unsigned count) //-------------------------------------------------------------------------- { - add_base_resource_ref(IS_EXPR_REF, count); + add_nested_resource_ref(id, count); } //-------------------------------------------------------------------------- - bool IndexSpaceOperation::remove_expression_tree_reference(unsigned count) + bool IndexSpaceOperation::remove_tree_expression_reference(DistributedID id, + unsigned count) //-------------------------------------------------------------------------- { - return remove_base_resource_ref(IS_EXPR_REF, count); + return remove_nested_resource_ref(id, count); } //-------------------------------------------------------------------------- @@ -6722,7 +6746,7 @@ namespace Legion { { // Add a reference to prevent the parents from being collected // as we're traversing up the tree - (*it)->add_base_resource_ref(IS_EXPR_REF); + (*it)->add_tree_expression_reference(did); parents[idx] = (*it); } } @@ -6733,7 +6757,7 @@ namespace Legion { { (*it)->invalidate_operation(to_remove); // Remove the reference when we're done with the parents - if ((*it)->remove_base_resource_ref(IS_EXPR_REF)) + if ((*it)->remove_tree_expression_reference(did)) delete (*it); } } @@ -7482,7 +7506,7 @@ namespace Legion { parent_operations.begin(); it != 
parent_operations.end(); it++, idx++) { - (*it)->add_expression_tree_reference(); + (*it)->add_tree_expression_reference(did); parents[idx] = (*it); } } @@ -7493,7 +7517,7 @@ namespace Legion { // Remove any references that we have on the parents for (std::vector::const_iterator it = parents.begin(); it != parents.end(); it++) - if ((*it)->remove_expression_tree_reference()) + if ((*it)->remove_tree_expression_reference(did)) delete (*it); } } @@ -8669,54 +8693,79 @@ namespace Legion { } //-------------------------------------------------------------------------- - void IndexSpaceNode::add_expression_reference( - std::set &applied_events, unsigned count) + void IndexSpaceNode::add_base_expression_reference( + ReferenceSource source, ReferenceMutator *mutator, unsigned count) + //-------------------------------------------------------------------------- + { + if (mutator == NULL) + { + LocalReferenceMutator local_mutator; + add_base_gc_ref(source, &local_mutator, count); + } + else + add_base_gc_ref(source, mutator, count); + } + + //-------------------------------------------------------------------------- + void IndexSpaceNode::add_nested_expression_reference( + DistributedID source, std::set &applied_events, unsigned count) //-------------------------------------------------------------------------- { WrapperReferenceMutator mutator(applied_events); - add_expression_reference(&mutator, count); + add_nested_expression_reference(source, &mutator, count); } //-------------------------------------------------------------------------- - void IndexSpaceNode::add_expression_reference( - ReferenceMutator *mutator, unsigned count) + void IndexSpaceNode::add_nested_expression_reference( + DistributedID source, ReferenceMutator *mutator, unsigned count) //-------------------------------------------------------------------------- { if (mutator == NULL) { LocalReferenceMutator local_mutator; - add_base_gc_ref(IS_EXPR_REF, &local_mutator, count); + add_nested_gc_ref(source, 
&local_mutator, count); } else - add_base_gc_ref(IS_EXPR_REF, mutator, count); + add_nested_gc_ref(source, mutator, count); + } + + //-------------------------------------------------------------------------- + bool IndexSpaceNode::remove_base_expression_reference( + ReferenceSource source, unsigned count) + //-------------------------------------------------------------------------- + { + return remove_base_gc_ref(source, NULL/*mutator*/, count); } //-------------------------------------------------------------------------- - bool IndexSpaceNode::remove_expression_reference(unsigned count) + bool IndexSpaceNode::remove_nested_expression_reference( + DistributedID source, unsigned count) //-------------------------------------------------------------------------- { - return remove_base_gc_ref(IS_EXPR_REF, NULL/*mutator*/, count); + return remove_nested_gc_ref(source, NULL/*mutator*/, count); } //-------------------------------------------------------------------------- - void IndexSpaceNode::add_expression_tree_reference(unsigned count) + void IndexSpaceNode::add_tree_expression_reference(DistributedID id, + unsigned count) //-------------------------------------------------------------------------- { - add_base_resource_ref(IS_EXPR_REF, count); + add_nested_resource_ref(id, count); } //-------------------------------------------------------------------------- - bool IndexSpaceNode::remove_expression_tree_reference(unsigned count) + bool IndexSpaceNode::remove_tree_expression_reference(DistributedID id, + unsigned count) //-------------------------------------------------------------------------- { - return remove_base_resource_ref(IS_EXPR_REF, count); + return remove_nested_resource_ref(id, count); } //-------------------------------------------------------------------------- bool IndexSpaceNode::remove_operation(RegionTreeForest *forest) //-------------------------------------------------------------------------- { - return remove_expression_tree_reference(); + 
return remove_base_resource_ref(IS_EXPR_REF); } //-------------------------------------------------------------------------- diff --git a/runtime/legion/region_tree.h b/runtime/legion/region_tree.h index 2abce33a6f..79ee089030 100644 --- a/runtime/legion/region_tree.h +++ b/runtime/legion/region_tree.h @@ -1028,7 +1028,7 @@ namespace Legion { TightenIndexSpaceArgs(IndexSpaceExpression *proxy) : LgTaskArgs(implicit_provenance), proxy_this(proxy) - { proxy->add_expression_tree_reference(); } + { proxy->add_base_expression_reference(IS_EXPR_REF); } public: IndexSpaceExpression *const proxy_this; }; @@ -1094,13 +1094,20 @@ namespace Legion { public: virtual bool try_add_canonical_reference(void) = 0; virtual bool remove_canonical_reference(void) = 0; - virtual void add_expression_reference(std::set &applied_events, - unsigned count = 1) = 0; - virtual void add_expression_reference(ReferenceMutator *mutator = NULL, - unsigned count = 1) = 0; - virtual bool remove_expression_reference(unsigned count = 1) = 0; - virtual void add_expression_tree_reference(unsigned count = 1) = 0; - virtual bool remove_expression_tree_reference(unsigned count = 1) = 0; + virtual void add_base_expression_reference(ReferenceSource source, + ReferenceMutator *mutator = NULL, unsigned count = 1) = 0; + virtual void add_nested_expression_reference(DistributedID source, + std::set &applied_events, unsigned count = 1) = 0; + virtual void add_nested_expression_reference(DistributedID source, + ReferenceMutator *mutator = NULL, unsigned count = 1) = 0; + virtual bool remove_base_expression_reference(ReferenceSource source, + unsigned count = 1) = 0; + virtual bool remove_nested_expression_reference(DistributedID source, + unsigned count = 1) = 0; + virtual void add_tree_expression_reference(DistributedID source, + unsigned count = 1) = 0; + virtual bool remove_tree_expression_reference(DistributedID source, + unsigned count = 1) = 0; public: virtual bool remove_operation(RegionTreeForest *forest) = 
0; virtual IndexSpaceNode* create_node(IndexSpace handle, @@ -1348,13 +1355,20 @@ namespace Legion { public: virtual bool try_add_canonical_reference(void); virtual bool remove_canonical_reference(void); - virtual void add_expression_reference(std::set &applied_events, - unsigned count = 1); - virtual void add_expression_reference(ReferenceMutator *mutator = NULL, - unsigned count = 1); - virtual bool remove_expression_reference(unsigned count = 1); - virtual void add_expression_tree_reference(unsigned count = 1); - virtual bool remove_expression_tree_reference(unsigned count = 1); + virtual void add_base_expression_reference(ReferenceSource source, + ReferenceMutator *mutator = NULL, unsigned count = 1); + virtual void add_nested_expression_reference(DistributedID source, + std::set &applied_events, unsigned count = 1); + virtual void add_nested_expression_reference(DistributedID source, + ReferenceMutator *mutator = NULL, unsigned count = 1); + virtual bool remove_base_expression_reference(ReferenceSource source, + unsigned count = 1); + virtual bool remove_nested_expression_reference(DistributedID source, + unsigned count = 1); + virtual void add_tree_expression_reference(DistributedID source, + unsigned count = 1); + virtual bool remove_tree_expression_reference(DistributedID source, + unsigned count = 1); public: virtual bool remove_operation(RegionTreeForest *forest) = 0; virtual IndexSpaceNode* create_node(IndexSpace handle, @@ -1963,13 +1977,20 @@ namespace Legion { public: virtual bool try_add_canonical_reference(void); virtual bool remove_canonical_reference(void); - virtual void add_expression_reference(std::set &applied_events, - unsigned count = 1); - virtual void add_expression_reference(ReferenceMutator *mutator = NULL, - unsigned count = 1); - virtual bool remove_expression_reference(unsigned count = 1); - virtual void add_expression_tree_reference(unsigned count = 1); - virtual bool remove_expression_tree_reference(unsigned count = 1); + virtual 
void add_base_expression_reference(ReferenceSource source, + ReferenceMutator *mutator = NULL, unsigned count = 1); + virtual void add_nested_expression_reference(DistributedID source, + std::set &applied_events, unsigned count = 1); + virtual void add_nested_expression_reference(DistributedID source, + ReferenceMutator *mutator = NULL, unsigned count = 1); + virtual bool remove_base_expression_reference(ReferenceSource source, + unsigned count = 1); + virtual bool remove_nested_expression_reference(DistributedID source, + unsigned count = 1); + virtual void add_tree_expression_reference(DistributedID source, + unsigned count = 1); + virtual bool remove_tree_expression_reference(DistributedID source, + unsigned count = 1); public: virtual bool remove_operation(RegionTreeForest *forest); virtual IndexSpaceNode* create_node(IndexSpace handle, diff --git a/runtime/legion/region_tree.inl b/runtime/legion/region_tree.inl index 3e3e3e2838..856622a524 100644 --- a/runtime/legion/region_tree.inl +++ b/runtime/legion/region_tree.inl @@ -1876,7 +1876,7 @@ namespace Legion { #endif // Add the parent and the reference sub->add_parent_operation(this); - sub->add_expression_tree_reference(); + sub->add_tree_expression_reference(this->did); // Then get the realm index space expression ApEvent precondition = sub->get_expr_index_space( &spaces[idx], this->type_tag, false/*need tight result*/); @@ -1946,7 +1946,7 @@ namespace Legion { { // Remove references from our sub expressions for (unsigned idx = 0; idx < sub_expressions.size(); idx++) - if (sub_expressions[idx]->remove_expression_tree_reference()) + if (sub_expressions[idx]->remove_tree_expression_reference(this->did)) delete sub_expressions[idx]; } @@ -1997,7 +1997,7 @@ namespace Legion { forest->remove_union_operation(this, sub_expressions); // Remove our expression reference added by invalidate_operation // and return true if we should be deleted - return this->remove_expression_tree_reference(); + return 
this->remove_base_resource_ref(IS_EXPR_REF); } //-------------------------------------------------------------------------- @@ -2019,7 +2019,7 @@ namespace Legion { #endif // Add the parent and the reference sub->add_parent_operation(this); - sub->add_expression_tree_reference(); + sub->add_tree_expression_reference(this->did); ApEvent precondition = sub->get_expr_index_space( &spaces[idx], this->type_tag, false/*need tight result*/); if (precondition.exists()) @@ -2089,7 +2089,7 @@ namespace Legion { { // Remove references from our sub expressions for (unsigned idx = 0; idx < sub_expressions.size(); idx++) - if (sub_expressions[idx]->remove_expression_tree_reference()) + if (sub_expressions[idx]->remove_tree_expression_reference(this->did)) delete sub_expressions[idx]; } @@ -2141,7 +2141,7 @@ namespace Legion { forest->remove_intersection_operation(this, sub_expressions); // Remove our expression reference added by invalidate_operation // and return true if we should be deleted - return this->remove_expression_tree_reference(); + return this->remove_base_resource_ref(IS_EXPR_REF); } //-------------------------------------------------------------------------- @@ -2160,7 +2160,7 @@ namespace Legion { { // Special case for when the expressions are the same lhs->add_parent_operation(this); - lhs->add_expression_tree_reference(); + lhs->add_tree_expression_reference(this->did); this->realm_index_space = Realm::IndexSpace::make_empty(); this->tight_index_space = Realm::IndexSpace::make_empty(); this->realm_index_space_ready = ApEvent::NO_AP_EVENT; @@ -2172,8 +2172,8 @@ namespace Legion { // Add the parent and the references lhs->add_parent_operation(this); rhs->add_parent_operation(this); - lhs->add_expression_tree_reference(); - rhs->add_expression_tree_reference(); + lhs->add_tree_expression_reference(this->did); + rhs->add_tree_expression_reference(this->did); ApEvent left_ready = lhs->get_expr_index_space(&lhs_space, this->type_tag, false/*tight*/); ApEvent 
right_ready = @@ -2239,9 +2239,9 @@ namespace Legion { //-------------------------------------------------------------------------- { if ((rhs != NULL) && (lhs != rhs) && - rhs->remove_expression_tree_reference()) + rhs->remove_tree_expression_reference(this->did)) delete rhs; - if ((lhs != NULL) && lhs->remove_expression_tree_reference()) + if ((lhs != NULL) && lhs->remove_tree_expression_reference(this->did)) delete lhs; } @@ -2294,7 +2294,7 @@ namespace Legion { forest->remove_subtraction_operation(this, lhs, rhs); // Remove our expression reference added by invalidate_operation // and return true if we should be deleted - return this->remove_expression_tree_reference(); + return this->remove_base_resource_ref(IS_EXPR_REF); } ///////////////////////////////////////////////////////////// From a9221c87e9512b1d123dc788fad860e478fc471a Mon Sep 17 00:00:00 2001 From: Mike Bauer Date: Wed, 1 Dec 2021 15:57:18 -0800 Subject: [PATCH 04/36] legion/tools: more fixes for new distributed expressions --- runtime/legion/garbage_collection.h | 4 ++-- runtime/legion/legion_analysis.cc | 6 +++--- runtime/legion/legion_views.cc | 5 ++++- tools/legion_gc.py | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/runtime/legion/garbage_collection.h b/runtime/legion/garbage_collection.h index 837c6be37d..b752d3ede2 100644 --- a/runtime/legion/garbage_collection.h +++ b/runtime/legion/garbage_collection.h @@ -76,7 +76,7 @@ namespace Legion { CONTEXT_REF = 17, RESTRICTED_REF = 18, VERSION_STATE_TREE_REF = 19, - PHYSICAL_MANAGER_REF = 20, + PHYSICAL_USER_REF = 20, LOGICAL_VIEW_REF = 21, REGION_TREE_REF = 22, LAYOUT_DESC_REF = 23, @@ -117,7 +117,7 @@ namespace Legion { "Context Reference", \ "Restricted Reference", \ "Version State Tree Reference", \ - "Physical Manager Reference", \ + "Physical User Reference", \ "Logical View Reference", \ "Region Tree Reference", \ "Layout Description Reference", \ diff --git a/runtime/legion/legion_analysis.cc 
b/runtime/legion/legion_analysis.cc index 7d2e0953ce..09a6d12949 100644 --- a/runtime/legion/legion_analysis.cc +++ b/runtime/legion/legion_analysis.cc @@ -80,7 +80,7 @@ namespace Legion { #ifdef DEBUG_LEGION assert(expr != NULL); #endif - expr->add_base_expression_reference(IS_EXPR_REF); + expr->add_base_expression_reference(PHYSICAL_USER_REF); } #else //-------------------------------------------------------------------------- @@ -92,7 +92,7 @@ namespace Legion { #ifdef DEBUG_LEGION assert(expr != NULL); #endif - expr->add_base_expression_reference(IS_EXPR_REF); + expr->add_base_expression_reference(PHYSICAL_USER_REF); } #endif @@ -116,7 +116,7 @@ namespace Legion { #ifdef DEBUG_LEGION assert(expr != NULL); #endif - if (expr->remove_base_expression_reference(IS_EXPR_REF)) + if (expr->remove_base_expression_reference(PHYSICAL_USER_REF)) delete expr; } diff --git a/runtime/legion/legion_views.cc b/runtime/legion/legion_views.cc index dcb15c3493..b1750141d7 100644 --- a/runtime/legion/legion_views.cc +++ b/runtime/legion/legion_views.cc @@ -761,7 +761,10 @@ namespace Legion { //-------------------------------------------------------------------------- ExprView::ExprView(const ExprView &rhs) : context(rhs.context), manager(rhs.manager), inst_view(rhs.inst_view), - view_did(0), view_expr(rhs.view_expr), view_volume(rhs.view_volume) + view_expr(rhs.view_expr), view_volume(rhs.view_volume) +#if defined(DEBUG_LEGION_GC) || defined(LEGION_GC) + , view_did(rhs.view_did) +#endif //-------------------------------------------------------------------------- { // should never be called diff --git a/tools/legion_gc.py b/tools/legion_gc.py index 14f7ed4beb..2636660b29 100755 --- a/tools/legion_gc.py +++ b/tools/legion_gc.py @@ -636,7 +636,7 @@ def __init__(self, did, node, handle): self.handle = handle def __repr__(self): - return 'Index Expressions '+str(self.did)+' (Node='+str(self.node)+') ExprID '+str(self.handle) + return 'Index Expression '+str(self.did)+' 
(Node='+str(self.node)+') ExprID '+str(self.handle) class FieldSpace(Base): def __init__(self, did, node, handle): From bce2b462128b8fa30b63a5394f20ca6a10935cae Mon Sep 17 00:00:00 2001 From: Mike Bauer Date: Wed, 1 Dec 2021 23:48:55 -0800 Subject: [PATCH 05/36] legion: more work on refactoring for distributed expressions to pass provenance information through canonical references --- runtime/legion/garbage_collection.cc | 40 ++++++++++++++++++++++++++++ runtime/legion/garbage_collection.h | 5 ++-- runtime/legion/region_tree.cc | 21 ++++++++------- runtime/legion/region_tree.h | 15 ++++++----- runtime/legion/region_tree.inl | 5 ++-- 5 files changed, 65 insertions(+), 21 deletions(-) diff --git a/runtime/legion/garbage_collection.cc b/runtime/legion/garbage_collection.cc index 9a26d6c64e..f9b909493c 100644 --- a/runtime/legion/garbage_collection.cc +++ b/runtime/legion/garbage_collection.cc @@ -486,6 +486,46 @@ namespace Legion { return true; } + //-------------------------------------------------------------------------- + bool DistributedCollectable::check_resource_and_increment( + DistributedID source, int cnt) + //-------------------------------------------------------------------------- + { + AutoLock gc(gc_lock); + if (current_state == DELETED_STATE) + return false; +#ifdef DEBUG_LEGION + assert(cnt >= 0); +#endif +#ifdef LEGION_GC + log_nested_ref(RESOURCE_REF_KIND, did, local_space, source, cnt); +#endif +#ifndef DEBUG_LEGION_GC + int previous = __sync_fetch_and_add(&resource_references, cnt); +#ifdef DEBUG_LEGION + assert(previous >= 0); +#endif + if (previous == 0) + has_resource_references = true; +#else + resource_references++; + source = LEGION_DISTRIBUTED_ID_FILTER(source); + std::map::iterator finder = + detailed_nested_resource_references.find(source); + if (finder == detailed_nested_resource_references.end()) + detailed_nested_resource_references[source] = cnt; + else + finder->second += cnt; + if (resource_references > cnt) + return true; +#ifdef 
DEBUG_LEGION + assert(!has_resource_references); +#endif + has_resource_references = true; +#endif + return true; + } + //-------------------------------------------------------------------------- void DistributedCollectable::add_resource_reference(void) //-------------------------------------------------------------------------- diff --git a/runtime/legion/garbage_collection.h b/runtime/legion/garbage_collection.h index b752d3ede2..fd2f8d4a1c 100644 --- a/runtime/legion/garbage_collection.h +++ b/runtime/legion/garbage_collection.h @@ -85,8 +85,7 @@ namespace Legion { TRACE_REF = 26, AGGREGATORE_REF = 27, FIELD_STATE_REF = 28, - CANONICAL_REF = 29, - LAST_SOURCE_REF = 30, + LAST_SOURCE_REF = 29, }; enum ReferenceKind { @@ -126,7 +125,6 @@ namespace Legion { "Physical Trace Reference", \ "Aggregator Reference", \ "Field State Reference", \ - "Canonical Index Space Expression Reference", \ } extern Realm::Logger log_garbage; @@ -336,6 +334,7 @@ namespace Legion { // Atomic check and increment operations inline bool check_valid_and_increment(ReferenceSource source,int cnt = 1); bool check_resource_and_increment(ReferenceSource source ,int cnt = 1); + bool check_resource_and_increment(DistributedID source, int cnt = 1); private: void add_gc_reference(ReferenceMutator *mutator); bool remove_gc_reference(ReferenceMutator *mutator); diff --git a/runtime/legion/region_tree.cc b/runtime/legion/region_tree.cc index 999de75122..a39d423ba3 100644 --- a/runtime/legion/region_tree.cc +++ b/runtime/legion/region_tree.cc @@ -6480,7 +6480,7 @@ namespace Legion { // forest has given us a reference back on it, see if we're the first // ones to write it, if not we can remove the reference now if (!__sync_bool_compare_and_swap(&canonical, NULL, expr)) - expr->remove_canonical_reference(); + expr->remove_canonical_reference(get_distributed_id()); return expr; } @@ -6557,7 +6557,8 @@ namespace Legion { #endif if (canonical == original) forest->remove_canonical_expression(canonical, 
volume); - else if (canonical->remove_canonical_reference()) + else if (canonical->remove_canonical_reference( + original->get_distributed_id())) delete canonical; } @@ -6636,17 +6637,17 @@ namespace Legion { } //-------------------------------------------------------------------------- - bool IndexSpaceOperation::try_add_canonical_reference(void) + bool IndexSpaceOperation::try_add_canonical_reference(DistributedID source) //-------------------------------------------------------------------------- { - return check_resource_and_increment(CANONICAL_REF); + return check_resource_and_increment(source); } //-------------------------------------------------------------------------- - bool IndexSpaceOperation::remove_canonical_reference(void) + bool IndexSpaceOperation::remove_canonical_reference(DistributedID source) //-------------------------------------------------------------------------- { - return remove_base_resource_ref(CANONICAL_REF); + return remove_nested_resource_ref(source); } //-------------------------------------------------------------------------- @@ -8679,17 +8680,17 @@ namespace Legion { } //-------------------------------------------------------------------------- - bool IndexSpaceNode::try_add_canonical_reference(void) + bool IndexSpaceNode::try_add_canonical_reference(DistributedID source) //-------------------------------------------------------------------------- { - return check_resource_and_increment(CANONICAL_REF); + return check_resource_and_increment(source); } //-------------------------------------------------------------------------- - bool IndexSpaceNode::remove_canonical_reference(void) + bool IndexSpaceNode::remove_canonical_reference(DistributedID source) //-------------------------------------------------------------------------- { - return remove_base_resource_ref(CANONICAL_REF); + return remove_nested_resource_ref(source); } //-------------------------------------------------------------------------- diff --git 
a/runtime/legion/region_tree.h b/runtime/legion/region_tree.h index 79ee089030..a12b3cde8e 100644 --- a/runtime/legion/region_tree.h +++ b/runtime/legion/region_tree.h @@ -1092,8 +1092,9 @@ namespace Legion { virtual void pack_expression_value(Serializer &rez, AddressSpaceID target) = 0; public: - virtual bool try_add_canonical_reference(void) = 0; - virtual bool remove_canonical_reference(void) = 0; + virtual DistributedID get_distributed_id(void) const = 0; + virtual bool try_add_canonical_reference(DistributedID source) = 0; + virtual bool remove_canonical_reference(DistributedID source) = 0; virtual void add_base_expression_reference(ReferenceSource source, ReferenceMutator *mutator = NULL, unsigned count = 1) = 0; virtual void add_nested_expression_reference(DistributedID source, @@ -1353,8 +1354,9 @@ namespace Legion { virtual void pack_expression_value(Serializer &rez, AddressSpaceID target) = 0; public: - virtual bool try_add_canonical_reference(void); - virtual bool remove_canonical_reference(void); + virtual DistributedID get_distributed_id(void) const { return did; } + virtual bool try_add_canonical_reference(DistributedID source); + virtual bool remove_canonical_reference(DistributedID source); virtual void add_base_expression_reference(ReferenceSource source, ReferenceMutator *mutator = NULL, unsigned count = 1); virtual void add_nested_expression_reference(DistributedID source, @@ -1975,8 +1977,9 @@ namespace Legion { virtual void pack_expression(Serializer &rez, AddressSpaceID target); virtual void pack_expression_value(Serializer &rez,AddressSpaceID target); public: - virtual bool try_add_canonical_reference(void); - virtual bool remove_canonical_reference(void); + virtual DistributedID get_distributed_id(void) const { return did; } + virtual bool try_add_canonical_reference(DistributedID source); + virtual bool remove_canonical_reference(DistributedID source); virtual void add_base_expression_reference(ReferenceSource source, ReferenceMutator 
*mutator = NULL, unsigned count = 1); virtual void add_nested_expression_reference(DistributedID source, diff --git a/runtime/legion/region_tree.inl b/runtime/legion/region_tree.inl index 856622a524..93a0afe5fd 100644 --- a/runtime/legion/region_tree.inl +++ b/runtime/legion/region_tree.inl @@ -1238,6 +1238,7 @@ namespace Legion { // No need to wait for the event, we know it is already triggered // because we called get_volume on this before we got here get_expr_index_space(&local_space, type_tag, true/*need tight result*/); + const DistributedID local_did = get_distributed_id(); for (std::set::const_iterator it = expressions.begin(); it != expressions.end(); it++) { @@ -1258,7 +1259,7 @@ namespace Legion { // We know that things are the same here // Try to add the expression reference, we can race with deletions // here though so handle the case we're we can't add a reference - if ((*it)->try_add_canonical_reference()) + if ((*it)->try_add_canonical_reference(local_did)) return (*it); else continue; @@ -1380,7 +1381,7 @@ namespace Legion { // If we get here that means we are congruent // Try to add the expression reference, we can race with deletions // here though so handle the case we're we can't add a reference - if ((*it)->try_add_canonical_reference()) + if ((*it)->try_add_canonical_reference(local_did)) return (*it); } // Did not find any congruences so add ourself From 63980d423cbdc626f6ef62443b5b44bd4430051b Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 2 Dec 2021 11:54:28 -0800 Subject: [PATCH 06/36] regent: Minor fix for kernel launch domain --- language/src/regent/cudahelper.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/src/regent/cudahelper.t b/language/src/regent/cudahelper.t index 95d547afe3..f209012c15 100644 --- a/language/src/regent/cudahelper.t +++ b/language/src/regent/cudahelper.t @@ -1340,7 +1340,7 @@ function cudahelper.codegen_kernel_call(cx, kernel_id, count, args, shared_mem_s else [grid].x, [grid].y, 
[grid].z = MAX_NUM_BLOCK, MAX_NUM_BLOCK, - [round_exp(num_blocks, MAX_NUM_BLOCK, MAX_NUM_BLOCK)] + [round_exp(num_blocks, MAX_NUM_BLOCK * MAX_NUM_BLOCK)] end end From c376f6845aaff97e357fdbd5d142c9eb6e6ab7fb Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Fri, 3 Dec 2021 11:49:58 -0800 Subject: [PATCH 07/36] regent: Better error message if trying to index a nil field in a symbol. --- language/src/regent/std_base.t | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/language/src/regent/std_base.t b/language/src/regent/std_base.t index c8ce2ae1ed..1a6765cb36 100644 --- a/language/src/regent/std_base.t +++ b/language/src/regent/std_base.t @@ -898,11 +898,11 @@ local symbol = {} function symbol:__index(field) local value = symbol[field] if value ~= nil then return value end - error("symbol has no field '" .. field .. "' (in lookup)", 2) + error("symbol has no field '" .. tostring(field) .. "' (in lookup)", 2) end function symbol:__newindex(field, value) - error("symbol has no field '" .. field .. "' (in assignment)", 2) + error("symbol has no field '" .. tostring(field) .. "' (in assignment)", 2) end do From 779121f14c70f6f697c8c86090228a5d2619c9ff Mon Sep 17 00:00:00 2001 From: Sean Treichler Date: Tue, 7 Dec 2021 15:14:37 -0800 Subject: [PATCH 08/36] Revert "legion: encode distributed ID kind for index space expressions" Revert "legion: refactor expression reference counting" This reverts commit ddbd82bcc23c684fda5c0fde86a1a738de56bc59. This reverts commit caf91f32fd55b52d6448ac1aaf4b746790687779. 
--- runtime/legion/region_tree.cc | 100 ++++++++++----------------------- runtime/legion/region_tree.h | 35 +++--------- runtime/legion/region_tree.inl | 24 ++++---- 3 files changed, 48 insertions(+), 111 deletions(-) diff --git a/runtime/legion/region_tree.cc b/runtime/legion/region_tree.cc index eaef5703bd..3a5c60bb7c 100644 --- a/runtime/legion/region_tree.cc +++ b/runtime/legion/region_tree.cc @@ -6427,7 +6427,7 @@ namespace Legion { { const TightenIndexSpaceArgs *targs = (const TightenIndexSpaceArgs*)args; targs->proxy_this->tighten_index_space(); - if (targs->proxy_this->remove_expression_tree_reference()) + if (targs->proxy_this->remove_expression_reference(true/*tree only*/)) delete targs->proxy_this; } @@ -6481,7 +6481,7 @@ namespace Legion { // forest has given us a reference back on it, see if we're the first // ones to write it, if not we can remove the reference now if (!__sync_bool_compare_and_swap(&canonical, NULL, expr)) - expr->remove_expression_tree_reference(); + expr->remove_expression_reference(true/*tree*/); return expr; } @@ -6570,14 +6570,14 @@ namespace Legion { IndexSpaceOperation::IndexSpaceOperation(TypeTag tag, OperationKind kind, RegionTreeForest *ctx) : IndexSpaceExpression(tag, ctx->runtime, inter_lock), - DistributedCollectable(ctx->runtime, LEGION_DISTRIBUTED_HELP_ENCODE( - ctx->runtime->get_available_distributed_id(), INDEX_EXPR_NODE_DC), + DistributedCollectable(ctx->runtime, + ctx->runtime->get_available_distributed_id(), ctx->runtime->address_space), context(ctx), origin_expr(this), op_kind(kind), invalidated(0) //-------------------------------------------------------------------------- { // We always keep a reference on ourself until we get invalidated - add_expression_tree_reference(); + add_expression_reference(true/*expr tree*/); #ifdef LEGION_GC log_garbage.info("GC Index Expr %lld %d %lld", LEGION_DISTRIBUTED_ID_FILTER(this->did), local_space, expr_id); @@ -6651,48 +6651,27 @@ namespace Legion { } 
//-------------------------------------------------------------------------- - void IndexSpaceOperation::add_expression_reference( - std::set &applied_events, unsigned count) + void IndexSpaceOperation::add_expression_reference(bool expr_tree) //-------------------------------------------------------------------------- { - WrapperReferenceMutator mutator(applied_events); - add_expression_reference(&mutator, count); - } - - //-------------------------------------------------------------------------- - void IndexSpaceOperation::add_expression_reference( - ReferenceMutator *mutator, unsigned count) - //-------------------------------------------------------------------------- - { - if (mutator == NULL) + if (!expr_tree) { - LocalReferenceMutator local_mutator; - add_base_gc_ref(IS_EXPR_REF, &local_mutator, count); + LocalReferenceMutator mutator; + add_base_gc_ref(IS_EXPR_REF, &mutator); } else - add_base_gc_ref(IS_EXPR_REF, mutator, count); + add_base_resource_ref(IS_EXPR_REF); } //-------------------------------------------------------------------------- - bool IndexSpaceOperation::remove_expression_reference(unsigned count) + bool IndexSpaceOperation::remove_expression_reference(bool expr_tree) //-------------------------------------------------------------------------- { - return remove_base_gc_ref(IS_EXPR_REF, NULL/*mutator*/, count); - } - - //-------------------------------------------------------------------------- - void IndexSpaceOperation::add_expression_tree_reference(unsigned count) - //-------------------------------------------------------------------------- - { - add_base_resource_ref(IS_EXPR_REF, count); - } - - //-------------------------------------------------------------------------- - bool IndexSpaceOperation::remove_expression_tree_reference(unsigned count) - //-------------------------------------------------------------------------- - { - return remove_base_resource_ref(IS_EXPR_REF, count); - } + if (expr_tree) + return 
remove_base_resource_ref(IS_EXPR_REF); + else + return remove_base_gc_ref(IS_EXPR_REF); + } //-------------------------------------------------------------------------- void IndexSpaceOperation::invalidate_operation( @@ -7482,7 +7461,7 @@ namespace Legion { parent_operations.begin(); it != parent_operations.end(); it++, idx++) { - (*it)->add_expression_tree_reference(); + (*it)->add_expression_reference(true/*expr tree*/); parents[idx] = (*it); } } @@ -7493,7 +7472,7 @@ namespace Legion { // Remove any references that we have on the parents for (std::vector::const_iterator it = parents.begin(); it != parents.end(); it++) - if ((*it)->remove_expression_tree_reference()) + if ((*it)->remove_expression_reference(true/*expr tree*/)) delete (*it); } } @@ -8669,54 +8648,33 @@ namespace Legion { } //-------------------------------------------------------------------------- - void IndexSpaceNode::add_expression_reference( - std::set &applied_events, unsigned count) + void IndexSpaceNode::add_expression_reference(bool expr_tree) //-------------------------------------------------------------------------- { - WrapperReferenceMutator mutator(applied_events); - add_expression_reference(&mutator, count); - } - - //-------------------------------------------------------------------------- - void IndexSpaceNode::add_expression_reference( - ReferenceMutator *mutator, unsigned count) - //-------------------------------------------------------------------------- - { - if (mutator == NULL) + if (!expr_tree) { - LocalReferenceMutator local_mutator; - add_base_gc_ref(IS_EXPR_REF, &local_mutator, count); + LocalReferenceMutator mutator; + add_base_valid_ref(IS_EXPR_REF, &mutator); } else - add_base_gc_ref(IS_EXPR_REF, mutator, count); + add_base_resource_ref(IS_EXPR_REF); } //-------------------------------------------------------------------------- - bool IndexSpaceNode::remove_expression_reference(unsigned count) + bool IndexSpaceNode::remove_expression_reference(bool expr_tree) 
//-------------------------------------------------------------------------- { - return remove_base_gc_ref(IS_EXPR_REF, NULL/*mutator*/, count); - } - - //-------------------------------------------------------------------------- - void IndexSpaceNode::add_expression_tree_reference(unsigned count) - //-------------------------------------------------------------------------- - { - add_base_resource_ref(IS_EXPR_REF, count); - } - - //-------------------------------------------------------------------------- - bool IndexSpaceNode::remove_expression_tree_reference(unsigned count) - //-------------------------------------------------------------------------- - { - return remove_base_resource_ref(IS_EXPR_REF, count); + if (expr_tree) + return remove_base_resource_ref(IS_EXPR_REF); + else + return remove_base_valid_ref(IS_EXPR_REF); } //-------------------------------------------------------------------------- bool IndexSpaceNode::remove_operation(RegionTreeForest *forest) //-------------------------------------------------------------------------- { - return remove_expression_tree_reference(); + return remove_expression_reference(true/*expr tree*/); } //-------------------------------------------------------------------------- diff --git a/runtime/legion/region_tree.h b/runtime/legion/region_tree.h index 2abce33a6f..74c971a145 100644 --- a/runtime/legion/region_tree.h +++ b/runtime/legion/region_tree.h @@ -1028,7 +1028,7 @@ namespace Legion { TightenIndexSpaceArgs(IndexSpaceExpression *proxy) : LgTaskArgs(implicit_provenance), proxy_this(proxy) - { proxy->add_expression_tree_reference(); } + { proxy->add_expression_reference(true/*tree only*/); } public: IndexSpaceExpression *const proxy_this; }; @@ -1091,17 +1091,10 @@ namespace Legion { virtual void pack_expression(Serializer &rez, AddressSpaceID target) = 0; virtual void pack_expression_value(Serializer &rez, AddressSpaceID target) = 0; - public: virtual bool try_add_canonical_reference(void) = 0; virtual bool 
remove_canonical_reference(void) = 0; - virtual void add_expression_reference(std::set &applied_events, - unsigned count = 1) = 0; - virtual void add_expression_reference(ReferenceMutator *mutator = NULL, - unsigned count = 1) = 0; - virtual bool remove_expression_reference(unsigned count = 1) = 0; - virtual void add_expression_tree_reference(unsigned count = 1) = 0; - virtual bool remove_expression_tree_reference(unsigned count = 1) = 0; - public: + virtual void add_expression_reference(bool expr_tree = false) = 0; + virtual bool remove_expression_reference(bool expr_tree = false) = 0; virtual bool remove_operation(RegionTreeForest *forest) = 0; virtual IndexSpaceNode* create_node(IndexSpace handle, DistributedID did, RtEvent initialized, @@ -1345,17 +1338,10 @@ namespace Legion { virtual void pack_expression(Serializer &rez, AddressSpaceID target) = 0; virtual void pack_expression_value(Serializer &rez, AddressSpaceID target) = 0; - public: virtual bool try_add_canonical_reference(void); virtual bool remove_canonical_reference(void); - virtual void add_expression_reference(std::set &applied_events, - unsigned count = 1); - virtual void add_expression_reference(ReferenceMutator *mutator = NULL, - unsigned count = 1); - virtual bool remove_expression_reference(unsigned count = 1); - virtual void add_expression_tree_reference(unsigned count = 1); - virtual bool remove_expression_tree_reference(unsigned count = 1); - public: + virtual void add_expression_reference(bool expr_tree = false); + virtual bool remove_expression_reference(bool expr_tree = false); virtual bool remove_operation(RegionTreeForest *forest) = 0; virtual IndexSpaceNode* create_node(IndexSpace handle, DistributedID did, RtEvent initialized, @@ -1960,17 +1946,10 @@ namespace Legion { virtual bool check_empty(void) = 0; virtual void pack_expression(Serializer &rez, AddressSpaceID target); virtual void pack_expression_value(Serializer &rez,AddressSpaceID target); - public: virtual bool 
try_add_canonical_reference(void); virtual bool remove_canonical_reference(void); - virtual void add_expression_reference(std::set &applied_events, - unsigned count = 1); - virtual void add_expression_reference(ReferenceMutator *mutator = NULL, - unsigned count = 1); - virtual bool remove_expression_reference(unsigned count = 1); - virtual void add_expression_tree_reference(unsigned count = 1); - virtual bool remove_expression_tree_reference(unsigned count = 1); - public: + virtual void add_expression_reference(bool expr_tree = false); + virtual bool remove_expression_reference(bool expr_tree = false); virtual bool remove_operation(RegionTreeForest *forest); virtual IndexSpaceNode* create_node(IndexSpace handle, DistributedID did, RtEvent initialized, diff --git a/runtime/legion/region_tree.inl b/runtime/legion/region_tree.inl index 3e3e3e2838..380b3145bb 100644 --- a/runtime/legion/region_tree.inl +++ b/runtime/legion/region_tree.inl @@ -1876,7 +1876,7 @@ namespace Legion { #endif // Add the parent and the reference sub->add_parent_operation(this); - sub->add_expression_tree_reference(); + sub->add_expression_reference(true/*expr tree*/); // Then get the realm index space expression ApEvent precondition = sub->get_expr_index_space( &spaces[idx], this->type_tag, false/*need tight result*/); @@ -1946,7 +1946,7 @@ namespace Legion { { // Remove references from our sub expressions for (unsigned idx = 0; idx < sub_expressions.size(); idx++) - if (sub_expressions[idx]->remove_expression_tree_reference()) + if (sub_expressions[idx]->remove_expression_reference(true/*exprtree*/)) delete sub_expressions[idx]; } @@ -1997,7 +1997,7 @@ namespace Legion { forest->remove_union_operation(this, sub_expressions); // Remove our expression reference added by invalidate_operation // and return true if we should be deleted - return this->remove_expression_tree_reference(); + return this->remove_expression_reference(true/*expr tree*/); } 
//-------------------------------------------------------------------------- @@ -2019,7 +2019,7 @@ namespace Legion { #endif // Add the parent and the reference sub->add_parent_operation(this); - sub->add_expression_tree_reference(); + sub->add_expression_reference(true/*expr tree*/); ApEvent precondition = sub->get_expr_index_space( &spaces[idx], this->type_tag, false/*need tight result*/); if (precondition.exists()) @@ -2089,7 +2089,7 @@ namespace Legion { { // Remove references from our sub expressions for (unsigned idx = 0; idx < sub_expressions.size(); idx++) - if (sub_expressions[idx]->remove_expression_tree_reference()) + if (sub_expressions[idx]->remove_expression_reference(true/*exprtree*/)) delete sub_expressions[idx]; } @@ -2141,7 +2141,7 @@ namespace Legion { forest->remove_intersection_operation(this, sub_expressions); // Remove our expression reference added by invalidate_operation // and return true if we should be deleted - return this->remove_expression_tree_reference(); + return this->remove_expression_reference(true/*expr tree*/); } //-------------------------------------------------------------------------- @@ -2160,7 +2160,7 @@ namespace Legion { { // Special case for when the expressions are the same lhs->add_parent_operation(this); - lhs->add_expression_tree_reference(); + lhs->add_expression_reference(true/*expr tree*/); this->realm_index_space = Realm::IndexSpace::make_empty(); this->tight_index_space = Realm::IndexSpace::make_empty(); this->realm_index_space_ready = ApEvent::NO_AP_EVENT; @@ -2172,8 +2172,8 @@ namespace Legion { // Add the parent and the references lhs->add_parent_operation(this); rhs->add_parent_operation(this); - lhs->add_expression_tree_reference(); - rhs->add_expression_tree_reference(); + lhs->add_expression_reference(true/*expr tree*/); + rhs->add_expression_reference(true/*expr tree*/); ApEvent left_ready = lhs->get_expr_index_space(&lhs_space, this->type_tag, false/*tight*/); ApEvent right_ready = @@ -2239,9 +2239,9 
@@ namespace Legion { //-------------------------------------------------------------------------- { if ((rhs != NULL) && (lhs != rhs) && - rhs->remove_expression_tree_reference()) + rhs->remove_expression_reference(true/*expr tree*/)) delete rhs; - if ((lhs != NULL) && lhs->remove_expression_tree_reference()) + if ((lhs != NULL) && lhs->remove_expression_reference(true/*expr tree*/)) delete lhs; } @@ -2294,7 +2294,7 @@ namespace Legion { forest->remove_subtraction_operation(this, lhs, rhs); // Remove our expression reference added by invalidate_operation // and return true if we should be deleted - return this->remove_expression_tree_reference(); + return this->remove_expression_reference(true/*expr tree*/); } ///////////////////////////////////////////////////////////// From fefc879fcb85ac41551e3228f850d93dd32470fa Mon Sep 17 00:00:00 2001 From: Sean Treichler Date: Tue, 7 Dec 2021 10:24:17 -0800 Subject: [PATCH 09/36] realm: fix flow control for multi-hop reductions --- runtime/realm/cuda/cuda_internal.cc | 14 ++++++++++++-- runtime/realm/transfer/channel.cc | 14 ++++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/runtime/realm/cuda/cuda_internal.cc b/runtime/realm/cuda/cuda_internal.cc index 4f3a513b94..6e694275d7 100644 --- a/runtime/realm/cuda/cuda_internal.cc +++ b/runtime/realm/cuda/cuda_internal.cc @@ -1131,12 +1131,22 @@ namespace Realm { } else { max_elems = std::min(input_control.remaining_count / in_elem_size, output_control.remaining_count / out_elem_size); - if(in_port != 0) + if(in_port != 0) { max_elems = std::min(max_elems, in_port->addrlist.bytes_pending() / in_elem_size); - if(out_port != 0) + if(in_port->peer_guid != XFERDES_NO_GUID) { + size_t read_bytes_avail = in_port->seq_remote.span_exists(in_port->local_bytes_total, + (max_elems * in_elem_size)); + max_elems = std::min(max_elems, + (read_bytes_avail / in_elem_size)); + } + } + if(out_port != 0) { max_elems = std::min(max_elems, out_port->addrlist.bytes_pending() 
/ out_elem_size); + // no support for reducing into an intermediate buffer + assert(out_port->peer_guid == XFERDES_NO_GUID); + } } size_t total_elems = 0; diff --git a/runtime/realm/transfer/channel.cc b/runtime/realm/transfer/channel.cc index c23856fee7..9ec0e095f1 100644 --- a/runtime/realm/transfer/channel.cc +++ b/runtime/realm/transfer/channel.cc @@ -3430,12 +3430,22 @@ namespace Realm { } else { max_elems = std::min(input_control.remaining_count / in_elem_size, output_control.remaining_count / out_elem_size); - if(in_port != 0) + if(in_port != 0) { max_elems = std::min(max_elems, in_port->addrlist.bytes_pending() / in_elem_size); - if(out_port != 0) + if(in_port->peer_guid != XFERDES_NO_GUID) { + size_t read_bytes_avail = in_port->seq_remote.span_exists(in_port->local_bytes_total, + (max_elems * in_elem_size)); + max_elems = std::min(max_elems, + (read_bytes_avail / in_elem_size)); + } + } + if(out_port != 0) { max_elems = std::min(max_elems, out_port->addrlist.bytes_pending() / out_elem_size); + // no support for reducing into an intermediate buffer + assert(out_port->peer_guid == XFERDES_NO_GUID); + } } size_t total_elems = 0; From 522d2c37709cbadee418c978f7f2d2d702f75777 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Fri, 10 Dec 2021 06:21:49 +0000 Subject: [PATCH 10/36] Fix dependency generation for .cu files --- runtime/runtime.mk | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/runtime/runtime.mk b/runtime/runtime.mk index 9bd70fae87..1496b00b44 100644 --- a/runtime/runtime.mk +++ b/runtime/runtime.mk @@ -1310,19 +1310,19 @@ $(MAPPER_OBJS) : %.cc.o : %.cc $(LEGION_DEFINES_HEADER) $(REALM_DEFINES_HEADER) ifeq ($(strip $(MK_HIP_TARGET)),ROCM) $(filter %.cpp.o,$(APP_OBJS)) : %.cpp.o : %.cpp $(LEGION_DEFINES_HEADER) $(REALM_DEFINES_HEADER) - $(HIPCC) -o $<.d -M $< $(HIPCC_FLAGS) $(INC_FLAGS) + $(HIPCC) -o $<.d -M -MT $@ $< $(HIPCC_FLAGS) $(INC_FLAGS) $(HIPCC) -o $@ -c $< $(HIPCC_FLAGS) $(INC_FLAGS) endif ifeq ($(strip 
$(MK_HIP_TARGET)),CUDA) $(filter %.cu.o,$(APP_OBJS)) : %.cu.o : %.cu $(LEGION_DEFINES_HEADER) $(REALM_DEFINES_HEADER) - $(HIPCC) -o $<.d -M $< $(HIPCC_FLAGS) $(INC_FLAGS) + $(HIPCC) -o $<.d -M -MT $@ $< $(HIPCC_FLAGS) $(INC_FLAGS) $(HIPCC) -o $@ -c $< $(HIPCC_FLAGS) $(INC_FLAGS) endif ifeq ($(strip $(USE_CUDA)),1) $(filter %.cu.o,$(APP_OBJS)) : %.cu.o : %.cu $(LEGION_DEFINES_HEADER) $(REALM_DEFINES_HEADER) - $(NVCC) -o $<.d -M $< $(NVCC_FLAGS) $(INC_FLAGS) + $(NVCC) -o $<.d -M -MT $@ $< $(NVCC_FLAGS) $(INC_FLAGS) $(NVCC) -o $@ -c $< $(NVCC_FLAGS) $(INC_FLAGS) endif @@ -1330,19 +1330,19 @@ ifeq ($(strip $(MK_HIP_TARGET)),ROCM) $(filter %.cpp,$(LEGION_HIP_SRC)): %.cpp : %.cu hipify-perl $< > $@ $(filter %.cpp.o,$(LEGION_OBJS)): %.cpp.o : %.cpp $(LEGION_DEFINES_HEADER) $(REALM_DEFINES_HEADER) - $(HIPCC) -o $<.d -M $< $(HIPCC_FLAGS) $(INC_FLAGS) + $(HIPCC) -o $<.d -M -MT $@ $< $(HIPCC_FLAGS) $(INC_FLAGS) $(HIPCC) -o $@ -c $< $(HIPCC_FLAGS) $(INC_FLAGS) endif ifeq ($(strip $(MK_HIP_TARGET)),CUDA) $(filter %.cu.o,$(LEGION_OBJS)): %.cu.o : %.cu $(LEGION_DEFINES_HEADER) $(REALM_DEFINES_HEADER) - $(HIPCC) -o $<.d -M $< $(HIPCC_FLAGS) $(INC_FLAGS) + $(HIPCC) -o $<.d -M -MT $@ $< $(HIPCC_FLAGS) $(INC_FLAGS) $(HIPCC) -o $@ -c $< $(HIPCC_FLAGS) $(INC_FLAGS) endif ifeq ($(strip $(USE_CUDA)),1) $(filter %.cu.o,$(LEGION_OBJS)): %.cu.o : %.cu $(LEGION_DEFINES_HEADER) $(REALM_DEFINES_HEADER) - $(NVCC) -o $<.d -M $< $(NVCC_FLAGS) $(INC_FLAGS) + $(NVCC) -o $<.d -M -MT $@ $< $(NVCC_FLAGS) $(INC_FLAGS) $(NVCC) -o $@ -c $< $(NVCC_FLAGS) $(INC_FLAGS) endif From 58082fd554dbcdfa91bc8f851e4944d9038cce3b Mon Sep 17 00:00:00 2001 From: Mike Bauer Date: Fri, 10 Dec 2021 01:17:45 -0800 Subject: [PATCH 11/36] legion: fixes for reference counting of index space expressions --- runtime/legion/garbage_collection.cc | 166 +++++ runtime/legion/garbage_collection.h | 54 +- runtime/legion/legion_analysis.cc | 86 ++- runtime/legion/legion_analysis.h | 5 +- runtime/legion/legion_context.h | 14 + 
runtime/legion/legion_instances.cc | 8 +- runtime/legion/legion_types.h | 103 ++- runtime/legion/region_tree.cc | 1016 ++++++++++++++++++-------- runtime/legion/region_tree.h | 230 ++++-- runtime/legion/region_tree.inl | 131 ++-- runtime/legion/runtime.cc | 89 ++- 11 files changed, 1386 insertions(+), 516 deletions(-) diff --git a/runtime/legion/garbage_collection.cc b/runtime/legion/garbage_collection.cc index f9b909493c..840c685068 100644 --- a/runtime/legion/garbage_collection.cc +++ b/runtime/legion/garbage_collection.cc @@ -447,6 +447,172 @@ namespace Legion { return do_deletion; } + //-------------------------------------------------------------------------- + bool DistributedCollectable::check_valid_and_increment( + ReferenceSource source, int cnt) + //-------------------------------------------------------------------------- + { + AutoLock gc(gc_lock); + if (current_state != VALID_STATE) + return false; +#ifdef DEBUG_LEGION + assert(cnt >= 0); +#endif +#ifdef LEGION_GC + log_base_ref(VALID_REF_KIND, did, local_space, source, cnt); +#endif +#ifndef DEBUG_LEGION_GC + int previous = __sync_fetch_and_add(&valid_references, cnt); +#ifdef DEBUG_LEGION + assert(previous >= 0); +#endif + if (previous == 0) + has_valid_references = true; +#else + valid_references++; + std::map::iterator finder = + detailed_base_valid_references.find(source); + if (finder == detailed_base_valid_references.end()) + detailed_base_valid_references[source] = cnt; + else + finder->second += cnt; + if (valid_references > cnt) + return true; +#ifdef DEBUG_LEGION + assert(!has_valid_references); +#endif + has_valid_references = true; +#endif + return true; + } + + //-------------------------------------------------------------------------- + bool DistributedCollectable::check_valid_and_increment( + DistributedID source, int cnt) + //-------------------------------------------------------------------------- + { + AutoLock gc(gc_lock); + if (current_state != VALID_STATE) + return false; 
+#ifdef DEBUG_LEGION + assert(cnt >= 0); +#endif +#ifdef LEGION_GC + log_nested_ref(VALID_REF_KIND, did, local_space, source, cnt); +#endif +#ifndef DEBUG_LEGION_GC + int previous = __sync_fetch_and_add(&valid_references, cnt); +#ifdef DEBUG_LEGION + assert(previous >= 0); +#endif + if (previous == 0) + has_valid_references = true; +#else + valid_references++; + source = LEGION_DISTRIBUTED_ID_FILTER(source); + std::map::iterator finder = + detailed_nested_valid_references.find(source); + if (finder == detailed_nested_valid_references.end()) + detailed_nested_valid_references[source] = cnt; + else + finder->second += cnt; + if (valid_references > cnt) + return true; +#ifdef DEBUG_LEGION + assert(!has_valid_references); +#endif + has_valid_references = true; +#endif + return true; + } + + //-------------------------------------------------------------------------- + bool DistributedCollectable::check_gc_and_increment( + ReferenceSource source, int cnt) + //-------------------------------------------------------------------------- + { + AutoLock gc(gc_lock); + if ((current_state == INACTIVE_STATE) || + (current_state == DELETED_STATE) || + (current_state == PENDING_ACTIVE_STATE) || + (current_state == PENDING_INACTIVE_STATE) || + (current_state == PENDING_INACTIVE_INVALID_STATE)) + return false; +#ifdef DEBUG_LEGION + assert(cnt >= 0); +#endif +#ifdef LEGION_GC + log_base_ref(GC_REF_KIND, did, local_space, source, cnt); +#endif +#ifndef DEBUG_LEGION_GC + int previous = __sync_fetch_and_add(&gc_references, cnt); +#ifdef DEBUG_LEGION + assert(previous >= 0); +#endif + if (previous == 0) + has_gc_references = true; +#else + gc_references++; + std::map::iterator finder = + detailed_base_gc_references.find(source); + if (finder == detailed_base_gc_references.end()) + detailed_base_gc_references[source] = cnt; + else + finder->second += cnt; + if (gc_references > cnt) + return true; +#ifdef DEBUG_LEGION + assert(!has_gc_references); +#endif + has_gc_references = true; 
+#endif + return true; + } + + //-------------------------------------------------------------------------- + bool DistributedCollectable::check_gc_and_increment( + DistributedID source, int cnt) + //-------------------------------------------------------------------------- + { + AutoLock gc(gc_lock); + if ((current_state == INACTIVE_STATE) || + (current_state == DELETED_STATE) || + (current_state == PENDING_ACTIVE_STATE) || + (current_state == PENDING_INACTIVE_STATE) || + (current_state == PENDING_INACTIVE_INVALID_STATE)) + return false; +#ifdef DEBUG_LEGION + assert(cnt >= 0); +#endif +#ifdef LEGION_GC + log_nested_ref(GC_REF_KIND, did, local_space, source, cnt); +#endif +#ifndef DEBUG_LEGION_GC + int previous = __sync_fetch_and_add(&gc_references, cnt); +#ifdef DEBUG_LEGION + assert(previous >= 0); +#endif + if (previous == 0) + has_gc_references = true; +#else + gc_references++; + source = LEGION_DISTRIBUTED_ID_FILTER(source); + std::map::iterator finder = + detailed_nested_gc_references.find(source); + if (finder == detailed_nested_gc_references.end()) + detailed_nested_gc_references[source] = cnt; + else + finder->second += cnt; + if (gc_references > cnt) + return true; +#ifdef DEBUG_LEGION + assert(!has_gc_references); +#endif + has_gc_references = true; +#endif + return true; + } + //-------------------------------------------------------------------------- bool DistributedCollectable::check_resource_and_increment( ReferenceSource source, int cnt) diff --git a/runtime/legion/garbage_collection.h b/runtime/legion/garbage_collection.h index fd2f8d4a1c..7166e010ec 100644 --- a/runtime/legion/garbage_collection.h +++ b/runtime/legion/garbage_collection.h @@ -66,7 +66,7 @@ namespace Legion { REMOTE_DID_REF = 7, PENDING_COLLECTIVE_REF = 8, MEMORY_MANAGER_REF = 9, - COMPOSITE_NODE_REF = 10, + INSTANCE_BUILDER_REF = 10, FIELD_ALLOCATOR_REF = 11, REMOTE_CREATE_REF = 12, INSTANCE_MAPPER_REF = 13, @@ -75,15 +75,15 @@ namespace Legion { NEVER_GC_REF = 16, CONTEXT_REF = 
17, RESTRICTED_REF = 18, - VERSION_STATE_TREE_REF = 19, + META_TASK_REF = 19, PHYSICAL_USER_REF = 20, LOGICAL_VIEW_REF = 21, REGION_TREE_REF = 22, LAYOUT_DESC_REF = 23, RUNTIME_REF = 24, - IS_EXPR_REF = 25, + LIVE_EXPR_REF = 25, TRACE_REF = 26, - AGGREGATORE_REF = 27, + AGGREGATOR_REF = 27, FIELD_STATE_REF = 28, LAST_SOURCE_REF = 29, }; @@ -106,7 +106,7 @@ namespace Legion { "Remote Distributed ID Reference", \ "Pending Collective Reference", \ "Memory Manager Reference", \ - "Composite Node Reference", \ + "Instance Builder Reference", \ "Field Allocator Reference", \ "Remote Creation Reference", \ "Instance Mapper Reference", \ @@ -115,13 +115,13 @@ namespace Legion { "Never GC Reference", \ "Context Reference", \ "Restricted Reference", \ - "Version State Tree Reference", \ + "Meta-Task Reference", \ "Physical User Reference", \ "Logical View Reference", \ "Region Tree Reference", \ "Layout Description Reference", \ "Runtime Reference", \ - "Index Space Expression Reference", \ + "Live Index Space Expression Reference", \ "Physical Trace Reference", \ "Aggregator Reference", \ "Field State Reference", \ @@ -331,8 +331,14 @@ namespace Legion { inline bool remove_base_resource_ref(ReferenceSource source, int cnt = 1); inline bool remove_nested_resource_ref(DistributedID source, int cnt = 1); public: +#ifdef DEBUG_LEGION + bool check_valid(void) const { return (current_state == VALID_STATE); } +#endif // Atomic check and increment operations - inline bool check_valid_and_increment(ReferenceSource source,int cnt = 1); + bool check_valid_and_increment(ReferenceSource source,int cnt = 1); + bool check_valid_and_increment(DistributedID source, int cnt = 1); + bool check_gc_and_increment(ReferenceSource source, int cnt = 1); + bool check_gc_and_increment(DistributedID source, int cnt = 1); bool check_resource_and_increment(ReferenceSource source ,int cnt = 1); bool check_resource_and_increment(DistributedID source, int cnt = 1); private: @@ -863,38 +869,6 @@ namespace 
Legion { #endif } - //-------------------------------------------------------------------------- - inline bool DistributedCollectable::check_valid_and_increment( - ReferenceSource source, int cnt /*=1*/) - //-------------------------------------------------------------------------- - { -#ifdef DEBUG_LEGION - assert(cnt >= 0); -#endif - // Don't support this if we are debugging GC -#ifndef DEBUG_LEGION_GC - // Read the value in an unsafe way at first - int current_cnt = valid_references; - // Don't even both trying if the count is zero - while (current_cnt > 0) - { - const int next_cnt = current_cnt + cnt; - const int prev_cnt = - __sync_val_compare_and_swap(&valid_references, current_cnt, next_cnt); - if (prev_cnt == current_cnt) - { -#ifdef LEGION_GC - log_base_ref(VALID_REF_KIND, did, local_space, source, cnt); -#endif - return true; - } - // Update the current count - current_cnt = prev_cnt; - } -#endif - return false; - } - }; // namespace Internal }; // namespace Legion diff --git a/runtime/legion/legion_analysis.cc b/runtime/legion/legion_analysis.cc index 09a6d12949..90d8440167 100644 --- a/runtime/legion/legion_analysis.cc +++ b/runtime/legion/legion_analysis.cc @@ -3973,8 +3973,41 @@ namespace Legion { // Remove references from any views that we have for (std::set::const_iterator it = all_views.begin(); it != all_views.end(); it++) - if ((*it)->remove_base_valid_ref(AGGREGATORE_REF)) + if ((*it)->remove_base_valid_ref(AGGREGATOR_REF)) delete (*it); + all_views.clear(); + // Remove source precondition expression references + for (std::map::iterator vit = + src_pre.begin(); vit != src_pre.end(); vit++) + { + for (EventFieldExprs::iterator eit = + vit->second.begin(); eit != vit->second.end(); eit++) + { + for (FieldMaskSet::iterator it = + eit->second.begin(); it != eit->second.end(); it++) + if (it->first->remove_base_expression_reference(AGGREGATOR_REF)) + delete it->first; + eit->second.clear(); + } + vit->second.clear(); + } + src_pre.clear(); + // 
Remove destination precondition expression references + for (std::map::iterator vit = + dst_pre.begin(); vit != dst_pre.end(); vit++) + { + for (EventFieldExprs::iterator eit = + vit->second.begin(); eit != vit->second.end(); eit++) + { + for (FieldMaskSet::iterator it = + eit->second.begin(); it != eit->second.end(); it++) + if (it->first->remove_base_expression_reference(AGGREGATOR_REF)) + delete it->first; + eit->second.clear(); + } + vit->second.clear(); + } + dst_pre.clear(); // Delete all our copy updates for (LegionMap >::aligned:: const_iterator mit = sources.begin(); mit != sources.end(); mit++) @@ -4014,6 +4047,23 @@ namespace Legion { return *this; } + //-------------------------------------------------------------------------- + CopyFillAggregator::Update::Update(IndexSpaceExpression *exp, + const FieldMask &mask, CopyAcrossHelper *helper) + : expr(exp), src_mask(mask), across_helper(helper) + //-------------------------------------------------------------------------- + { + expr->add_base_expression_reference(AGGREGATOR_REF); + } + + //-------------------------------------------------------------------------- + CopyFillAggregator::Update::~Update(void) + //-------------------------------------------------------------------------- + { + if (expr->remove_base_expression_reference(AGGREGATOR_REF)) + delete expr; + } + //-------------------------------------------------------------------------- void CopyFillAggregator::CopyUpdate::record_source_expressions( InstanceFieldExprs &src_exprs) const @@ -4458,6 +4508,7 @@ namespace Legion { #ifdef DEBUG_LEGION assert(!preconditions.empty()); #endif + WrapperReferenceMutator mutator(effects); AutoLock p_lock(pre_lock); EventFieldExprs &pre = reading ? 
src_pre[view] : dst_pre[view]; for (EventFieldExprs::iterator eit = preconditions.begin(); @@ -4473,13 +4524,23 @@ namespace Legion { FieldMaskSet::iterator finder = event_finder->second.find(it->first); if (finder == event_finder->second.end()) + { + // Keep a reference in case we are deferred + it->first->add_base_expression_reference(AGGREGATOR_REF,&mutator); event_finder->second.insert(it->first, it->second); + } else finder.merge(it->second); } } else // We can just swap this over + { + // Keep references in case we are deferred + for (FieldMaskSet::const_iterator it = + eit->second.begin(); it != eit->second.end(); it++) + it->first->add_base_expression_reference(AGGREGATOR_REF, &mutator); pre[eit->first].swap(eit->second); + } } } @@ -4496,7 +4557,12 @@ namespace Legion { FieldMaskSet::iterator finder = event_pre.find(expr); if (finder == event_pre.end()) + { event_pre.insert(expr, mask); + // Keep a reference in case we are deferred + WrapperReferenceMutator mutator(effects); + expr->add_base_expression_reference(AGGREGATOR_REF, &mutator); + } else finder.merge(mask); } @@ -4608,7 +4674,7 @@ namespace Legion { std::pair::iterator,bool> result = all_views.insert(new_view); if (result.second) - new_view->add_base_valid_ref(AGGREGATORE_REF, this); + new_view->add_base_valid_ref(AGGREGATOR_REF, this); } //-------------------------------------------------------------------------- @@ -9685,6 +9751,10 @@ namespace Legion { std::map *copy_exprs = new std::map(); copy_exprs->swap(to_traverse_exprs); + WrapperReferenceMutator mutator(done_events); + for (std::map::const_iterator + it = copy_exprs->begin(); it != copy_exprs->end(); it++) + it->second->add_base_expression_reference(META_TASK_REF, &mutator); const RtUserEvent done = Runtime::create_rt_user_event(); DeferRayTraceFinishArgs args(target, source, copy_traverse, copy_exprs, expr->get_volume(), handle, done); @@ -13865,7 +13935,7 @@ namespace Legion { 
//-------------------------------------------------------------------------- { if (local) - expr->add_base_expression_reference(IS_EXPR_REF); + expr->add_base_expression_reference(META_TASK_REF); } //-------------------------------------------------------------------------- @@ -13886,7 +13956,7 @@ namespace Legion { delete dargs->ray_mask; // Remove our expression reference too if (dargs->is_local && - dargs->expr->remove_base_expression_reference(IS_EXPR_REF)) + dargs->expr->remove_base_expression_reference(META_TASK_REF)) delete dargs->expr; } @@ -13918,6 +13988,10 @@ namespace Legion { Runtime::trigger_event(dargs->done, Runtime::merge_events(done_events)); else Runtime::trigger_event(dargs->done); + for (std::map::const_iterator it = + dargs->exprs->begin(); it != dargs->exprs->end(); it++) + if (it->second->remove_base_expression_reference(META_TASK_REF)) + delete it->second; delete dargs->to_traverse; delete dargs->exprs; } @@ -13983,7 +14057,7 @@ namespace Legion { //-------------------------------------------------------------------------- { if (is_local) - expr->add_base_expression_reference(IS_EXPR_REF); + expr->add_base_expression_reference(META_TASK_REF); } //-------------------------------------------------------------------------- @@ -14074,7 +14148,7 @@ namespace Legion { set->register_with_runtime(NULL/*no remote registration needed*/); // Remove our expression reference too if (dargs->is_local && - dargs->expr->remove_base_expression_reference(IS_EXPR_REF)) + dargs->expr->remove_base_expression_reference(META_TASK_REF)) delete dargs->expr; } diff --git a/runtime/legion/legion_analysis.h b/runtime/legion/legion_analysis.h index 01681648bc..58c2205b1e 100644 --- a/runtime/legion/legion_analysis.h +++ b/runtime/legion/legion_analysis.h @@ -1233,9 +1233,8 @@ namespace Legion { class Update { public: Update(IndexSpaceExpression *exp, const FieldMask &mask, - CopyAcrossHelper *helper) - : expr(exp), src_mask(mask), across_helper(helper) { } - virtual 
~Update(void) { } + CopyAcrossHelper *helper); + virtual ~Update(void); public: virtual void record_source_expressions( InstanceFieldExprs &src_exprs) const = 0; diff --git a/runtime/legion/legion_context.h b/runtime/legion/legion_context.h index 3f8103aafa..ee2d2b16cb 100644 --- a/runtime/legion/legion_context.h +++ b/runtime/legion/legion_context.h @@ -2063,6 +2063,9 @@ namespace Legion { inline void TaskContext::begin_runtime_call(void) //-------------------------------------------------------------------------- { +#ifdef DEBUG_LEGION + assert(implicit_live_expressions == NULL); +#endif if (overhead_tracker == NULL) return; const long long current = Realm::Clock::current_time_in_nanoseconds(); @@ -2075,6 +2078,17 @@ namespace Legion { inline void TaskContext::end_runtime_call(void) //-------------------------------------------------------------------------- { + if (implicit_live_expressions != NULL) + { + // Remove references to any live index space expressions we have + for (std::vector::const_iterator it = + implicit_live_expressions->begin(); it != + implicit_live_expressions->end(); it++) + if ((*it)->remove_base_expression_reference(LIVE_EXPR_REF)) + delete (*it); + delete implicit_live_expressions; + implicit_live_expressions = NULL; + } if (overhead_tracker == NULL) return; const long long current = Realm::Clock::current_time_in_nanoseconds(); diff --git a/runtime/legion/legion_instances.cc b/runtime/legion/legion_instances.cc index cbf204d642..f6ccf78777 100644 --- a/runtime/legion/legion_instances.cc +++ b/runtime/legion/legion_instances.cc @@ -1592,7 +1592,7 @@ namespace Legion { //-------------------------------------------------------------------------- { if (local_is) - local_expr->add_base_expression_reference(IS_EXPR_REF); + local_expr->add_base_expression_reference(META_TASK_REF); } //-------------------------------------------------------------------------- @@ -1614,7 +1614,7 @@ namespace Legion { dargs->use_event, dargs->redop, 
dargs->shadow_instance); // Remove the local expression reference if necessary if (dargs->local_is && - dargs->local_expr->remove_base_expression_reference(IS_EXPR_REF)) + dargs->local_expr->remove_base_expression_reference(META_TASK_REF)) delete dargs->local_expr; } @@ -3022,7 +3022,7 @@ namespace Legion { //-------------------------------------------------------------------------- { if (local_is) - local_expr->add_base_expression_reference(IS_EXPR_REF); + local_expr->add_base_expression_reference(META_TASK_REF); } //-------------------------------------------------------------------------- @@ -3046,7 +3046,7 @@ namespace Legion { dargs->use_event, dargs->redop); // Remove the local expression reference if necessary if (dargs->local_is && - dargs->local_expr->remove_base_expression_reference(IS_EXPR_REF)) + dargs->local_expr->remove_base_expression_reference(META_TASK_REF)) delete dargs->local_expr; } diff --git a/runtime/legion/legion_types.h b/runtime/legion/legion_types.h index ab60882c1f..d152d4f767 100644 --- a/runtime/legion/legion_types.h +++ b/runtime/legion/legion_types.h @@ -1480,43 +1480,7 @@ namespace Legion { class InnerContext;; class TopLevelContext; class RemoteContext; - class LeafContext; - - // Nasty global variable for TLS support of figuring out - // our context implicitly - extern __thread TaskContext *implicit_context; - // Same thing for the runtime - extern __thread Runtime *implicit_runtime; - // Another nasty global variable for tracking the fast - // reservations that we are holding - extern __thread AutoLock *local_lock_list; - // One more nasty global variable that we use for tracking - // the provenance of meta-task operations for profiling - // purposes, this has no bearing on correctness - extern __thread ::legion_unique_id_t implicit_provenance; - // Use this to track if we're inside of a registration - // callback function which we know to be deduplicated - enum RegistrationCallbackMode { - NO_REGISTRATION_CALLBACK = 0, - 
LOCAL_REGISTRATION_CALLBACK = 1, - GLOBAL_REGISTRATION_CALLBACK = 2, - }; - extern __thread unsigned inside_registration_callback; - - /** - * \class LgTaskArgs - * The base class for all Legion Task arguments - */ - template - struct LgTaskArgs { - public: - LgTaskArgs(::legion_unique_id_t uid) - : provenance(uid), lg_task_id(T::TASK_ID) { } - public: - // In this order for alignment reasons - const ::legion_unique_id_t provenance; - const LgTaskID lg_task_id; - }; + class LeafContext; // legion_trace.h class LegionTrace; @@ -1554,6 +1518,7 @@ namespace Legion { class RegionTreeForest; class CopyIndirection; class IndexSpaceExpression; + class IndexSpaceExprRef; class IndexSpaceOperation; template class IndexSpaceOperationT; template class IndexSpaceUnion; @@ -1641,6 +1606,48 @@ namespace Legion { typedef Mapping::MapperEvent MapperEvent; typedef Mapping::ProfilingMeasurementID ProfilingMeasurementID; + // Nasty global variable for TLS support of figuring out + // our context implicitly + extern __thread TaskContext *implicit_context; + // Same thing for the runtime + extern __thread Runtime *implicit_runtime; + // Another nasty global variable for tracking the fast + // reservations that we are holding + extern __thread AutoLock *local_lock_list; + // One more nasty global variable that we use for tracking + // the provenance of meta-task operations for profiling + // purposes, this has no bearing on correctness + extern __thread ::legion_unique_id_t implicit_provenance; + // Use this to track if we're inside of a registration + // callback function which we know to be deduplicated + enum RegistrationCallbackMode { + NO_REGISTRATION_CALLBACK = 0, + LOCAL_REGISTRATION_CALLBACK = 1, + GLOBAL_REGISTRATION_CALLBACK = 2, + }; + extern __thread unsigned inside_registration_callback; + // This data structure tracks references to any live + // temporary index space expressions that have been + // handed back by the region tree inside the execution + // of a meta-task or 
a runtime API call + extern __thread + std::vector *implicit_live_expressions; + + /** + * \class LgTaskArgs + * The base class for all Legion Task arguments + */ + template + struct LgTaskArgs { + public: + LgTaskArgs(::legion_unique_id_t uid) + : provenance(uid), lg_task_id(T::TASK_ID) { } + public: + // In this order for alignment reasons + const ::legion_unique_id_t provenance; + const LgTaskID lg_task_id; + }; + #define FRIEND_ALL_RUNTIME_CLASSES \ friend class Legion::Runtime; \ friend class Internal::Runtime; \ @@ -2377,6 +2384,12 @@ namespace Legion { UniqueID local_provenance = Internal::implicit_provenance; // Save whether we are in a registration callback unsigned local_callback = Internal::inside_registration_callback; + // Save any local live expressions that we have + std::vector *local_live_expressions = + Internal::implicit_live_expressions; +#ifdef DEBUG_LEGION + Internal::implicit_live_expressions = NULL; +#endif // Check to see if we have any local locks to notify if (Internal::local_lock_list != NULL) { @@ -2415,6 +2428,11 @@ namespace Legion { Internal::implicit_provenance = local_provenance; // Write the registration callback information back Internal::inside_registration_callback = local_callback; +#ifdef DEBUG_LEGION + assert(Internal::implicit_live_expressions == NULL); +#endif + // Write the local live expressions back + Internal::implicit_live_expressions = local_live_expressions; } //-------------------------------------------------------------------------- @@ -2427,6 +2445,12 @@ namespace Legion { UniqueID local_provenance = Internal::implicit_provenance; // Save whether we are in a registration callback unsigned local_callback = Internal::inside_registration_callback; + // Save any local live expressions that we have + std::vector *local_live_expressions = + Internal::implicit_live_expressions; +#ifdef DEBUG_LEGION + Internal::implicit_live_expressions = NULL; +#endif // Check to see if we have any local locks to notify if 
(Internal::local_lock_list != NULL) { @@ -2465,6 +2489,11 @@ namespace Legion { Internal::implicit_provenance = local_provenance; // Write the registration callback information back Internal::inside_registration_callback = local_callback; +#ifdef DEBUG_LEGION + assert(Internal::implicit_live_expressions == NULL); +#endif + // Write the local live expressions back + Internal::implicit_live_expressions = local_live_expressions; } #ifdef LEGION_SPY diff --git a/runtime/legion/region_tree.cc b/runtime/legion/region_tree.cc index 7663bd3ee1..249beb85c8 100644 --- a/runtime/legion/region_tree.cc +++ b/runtime/legion/region_tree.cc @@ -2308,7 +2308,8 @@ namespace Legion { return result; } else - return intersect->issue_copy(trace_info, dst_fields, src_fields, + return intersect->issue_copy(trace_info, + dst_fields, src_fields, #ifdef LEGION_SPY src_req.region.get_tree_id(), dst_req.region.get_tree_id(), @@ -2616,7 +2617,7 @@ namespace Legion { RegionNode *src_node = get_node(src_req.region); RegionNode *idx_node = get_node(idx_req.region); RegionNode *dst_node = get_node(dst_req.region); - IndexSpaceExpression *copy_expr = + IndexSpaceExpression *copy_expr = (idx_node->row_source == src_node->row_source) ? idx_node->row_source : intersect_index_spaces(src_node->row_source, idx_node->row_source); // Easy out if we're not going to move anything @@ -2796,8 +2797,8 @@ namespace Legion { RegionNode *src_idx_node = get_node(src_idx_req.region); RegionNode *dst_node = get_node(dst_req.region); RegionNode *dst_idx_node = get_node(dst_idx_req.region); - IndexSpaceExpression *copy_expr = - (src_idx_node->row_source == dst_idx_node->row_source) ? + IndexSpaceExpression *copy_expr = + (src_idx_node->row_source == dst_idx_node->row_source) ? 
src_idx_node->row_source : intersect_index_spaces( src_idx_node->row_source, dst_idx_node->row_source); // Quick out if there is nothing we're going to copy @@ -5596,43 +5597,66 @@ namespace Legion { //-------------------------------------------------------------------------- IndexSpaceExpression* RegionTreeForest::union_index_spaces( - IndexSpaceExpression *lhs, IndexSpaceExpression *rhs) + IndexSpaceExpression *lhs, IndexSpaceExpression *rhs, + ReferenceMutator *mutator) //-------------------------------------------------------------------------- { #ifdef DEBUG_LEGION assert(lhs->type_tag == rhs->type_tag); + assert(lhs->is_valid()); + assert(rhs->is_valid()); #endif if (lhs == rhs) return lhs; if (lhs->is_empty()) return rhs; - lhs = lhs->get_canonical_expression(this); if (rhs->is_empty()) return lhs; - rhs = rhs->get_canonical_expression(this); - if (lhs == rhs) + IndexSpaceExpression *lhs_canon = lhs->get_canonical_expression(this); + IndexSpaceExpression *rhs_canon = rhs->get_canonical_expression(this); + if (lhs_canon == rhs_canon) return lhs; std::vector exprs(2); - if (compare_expressions(lhs, rhs)) + if (compare_expressions(lhs_canon, rhs_canon)) { - exprs[0] = lhs; - exprs[1] = rhs; + exprs[0] = lhs_canon; + exprs[1] = rhs_canon; } else { - exprs[0] = rhs; - exprs[1] = lhs; + exprs[0] = rhs_canon; + exprs[1] = lhs_canon; } - return union_index_spaces(exprs); + IndexSpaceExpression *result = union_index_spaces(exprs); + // Add the live reference + if (mutator == NULL) + { + LocalReferenceMutator local_mutator; + result->add_base_expression_reference(LIVE_EXPR_REF, &local_mutator); + } + else + result->add_base_expression_reference(LIVE_EXPR_REF, mutator); + // Save it in the implicit live expression references + if (implicit_live_expressions == NULL) + implicit_live_expressions = new std::vector(); + implicit_live_expressions->emplace_back(result); + // Remove the gc reference that comes back from finding it in the tree + if 
(result->remove_live_reference(REGION_TREE_REF)) + assert(false); // should never hit this + return result; } //-------------------------------------------------------------------------- IndexSpaceExpression* RegionTreeForest::union_index_spaces( - const std::set &exprs) + const std::set &exprs, + ReferenceMutator *mutator) //-------------------------------------------------------------------------- { #ifdef DEBUG_LEGION assert(!exprs.empty()); + for (std::set::const_iterator it = + exprs.begin(); it != exprs.end(); it++) + assert((*it)->is_valid()); #endif if (exprs.size() == 1) return *(exprs.begin()); @@ -5647,8 +5671,23 @@ namespace Legion { } if (expressions.empty()) return *(exprs.begin()); + LocalReferenceMutator local_mutator; if (expressions.size() == 1) - return expressions[0]; + { + IndexSpaceExpression *result = expressions.back(); + if (exprs.find(result) == exprs.end()) + { + if (mutator == NULL) + result->add_base_expression_reference(LIVE_EXPR_REF,&local_mutator); + else + result->add_base_expression_reference(LIVE_EXPR_REF, mutator); + if (implicit_live_expressions == NULL) + implicit_live_expressions = + new std::vector; + implicit_live_expressions->emplace_back(result); + } + return result; + } // sort them in order by their IDs std::sort(expressions.begin(), expressions.end(), compare_expressions); // remove duplicates @@ -5661,8 +5700,24 @@ namespace Legion { assert(!expressions.empty()); #endif if (expressions.size() == 1) + { + IndexSpaceExpression *result = expressions.back(); + if (exprs.find(result) == exprs.end()) + { + if (mutator == NULL) + result->add_base_expression_reference(LIVE_EXPR_REF, + &local_mutator); + else + result->add_base_expression_reference(LIVE_EXPR_REF, mutator); + if (implicit_live_expressions == NULL) + implicit_live_expressions = + new std::vector; + implicit_live_expressions->emplace_back(result); + } return expressions.back(); + } } + bool first_pass = true; // this helps make sure we don't overflow our stack 
while (expressions.size() > MAX_EXPRESSION_FANOUT) { @@ -5681,34 +5736,110 @@ namespace Legion { if (expressions.empty()) break; } - next_expressions.push_back(union_index_spaces(temp_expressions)); + IndexSpaceExpression *expr = union_index_spaces(temp_expressions); + if (mutator == NULL) + expr->add_base_expression_reference(REGION_TREE_REF, + &local_mutator); + else + expr->add_base_expression_reference(REGION_TREE_REF, mutator); + // Remove the gc ref that comes back from the union call + if (expr->remove_live_reference(REGION_TREE_REF)) + assert(false); // should never hit this + next_expressions.push_back(expr); } else { - next_expressions.push_back(expressions.back()); + IndexSpaceExpression *expr = expressions.back(); expressions.pop_back(); + if (mutator == NULL) + expr->add_base_expression_reference(REGION_TREE_REF, + &local_mutator); + else + expr->add_base_expression_reference(REGION_TREE_REF, mutator); + next_expressions.push_back(expr); } } + if (!first_pass) + { + // Remove the expression references on the previous set + for (std::vector::const_iterator it = + expressions.begin(); it != expressions.end(); it++) + if ((*it)->remove_base_expression_reference(REGION_TREE_REF)) + delete (*it); + } + else + first_pass = false; expressions.swap(next_expressions); // canonicalize and uniquify them all again + std::set unique_expressions; for (unsigned idx = 0; idx < expressions.size(); idx++) { - IndexSpaceExpression *&expr = expressions[idx]; - expr = expr->get_canonical_expression(this); + IndexSpaceExpression *expr = expressions[idx]; + IndexSpaceExpression *unique = expr->get_canonical_expression(this); + if (unique_expressions.insert(unique).second) + { + if (mutator == NULL) + unique->add_base_expression_reference(REGION_TREE_REF, + &local_mutator); + else + unique->add_base_expression_reference(REGION_TREE_REF, mutator); + } } - std::sort(expressions.begin(), expressions.end(), compare_expressions); - last = std::unique(expressions.begin(), 
expressions.end()); - if (last != expressions.end()) + // Remove the expression references + for (std::vector::const_iterator it = + expressions.begin(); it != expressions.end(); it++) + if ((*it)->remove_base_expression_reference(REGION_TREE_REF)) + delete (*it); + if (unique_expressions.size() == 1) { - expressions.erase(last, expressions.end()); -#ifdef DEBUG_LEGION - assert(!expressions.empty()); -#endif - if (expressions.size() == 1) - return expressions.back(); + IndexSpaceExpression *result = *(unique_expressions.begin()); + if (exprs.find(result) == exprs.end()) + { + if (mutator == NULL) + result->add_base_expression_reference(LIVE_EXPR_REF, + &local_mutator); + else + result->add_base_expression_reference(LIVE_EXPR_REF, mutator); + if (implicit_live_expressions == NULL) + implicit_live_expressions = + new std::vector(); + implicit_live_expressions->emplace_back(result); + } + // Remove the extra expression reference we added + if (result->remove_base_expression_reference(REGION_TREE_REF)) + assert(false); // should never hit this + return result; } + expressions.resize(unique_expressions.size()); + unsigned index = 0; + for (std::set::const_iterator + it = unique_expressions.begin(); + it != unique_expressions.end(); it++) + expressions[index++] = *it; } - return union_index_spaces(expressions); + IndexSpaceExpression *result = union_index_spaces(expressions); + if (exprs.find(result) == exprs.end()) + { + if (mutator == NULL) + result->add_base_expression_reference(LIVE_EXPR_REF,&local_mutator); + else + result->add_base_expression_reference(LIVE_EXPR_REF, mutator); + if (implicit_live_expressions == NULL) + implicit_live_expressions = new std::vector(); + implicit_live_expressions->emplace_back(result); + } + // Remove the reference added by the trie traversal + if (result->remove_live_reference(REGION_TREE_REF)) + assert(false); // should never hit this deletion + if (!first_pass) + { + // Remove the extra references on the expression vector we added + 
for (std::vector::const_iterator it = + expressions.begin(); it != expressions.end(); it++) + if ((*it)->remove_base_expression_reference(REGION_TREE_REF)) + delete (*it); + } + return result; } //-------------------------------------------------------------------------- @@ -5732,7 +5863,8 @@ namespace Legion { { IndexSpaceExpression *result = NULL; ExpressionTrieNode *next = NULL; - if (finder->second->find_operation(expressions, result, next)) + if (finder->second->find_operation(expressions, result, next) && + result->try_add_live_reference(REGION_TREE_REF)) return result; if (creator == NULL) { @@ -5789,7 +5921,8 @@ namespace Legion { //-------------------------------------------------------------------------- IndexSpaceExpression* RegionTreeForest::intersect_index_spaces( - IndexSpaceExpression *lhs, IndexSpaceExpression *rhs) + IndexSpaceExpression *lhs, IndexSpaceExpression *rhs, + ReferenceMutator *mutator) //-------------------------------------------------------------------------- { #ifdef DEBUG_LEGION @@ -5799,33 +5932,53 @@ namespace Legion { return lhs; if (lhs->is_empty()) return lhs; - lhs = lhs->get_canonical_expression(this); if (rhs->is_empty()) return rhs; - rhs = rhs->get_canonical_expression(this); - if (lhs == rhs) + IndexSpaceExpression *lhs_canon = lhs->get_canonical_expression(this); + IndexSpaceExpression *rhs_canon = rhs->get_canonical_expression(this); + if (lhs_canon == rhs_canon) return lhs; std::vector exprs(2); - if (compare_expressions(lhs, rhs)) + if (compare_expressions(lhs_canon, rhs_canon)) { - exprs[0] = lhs; - exprs[1] = rhs; + exprs[0] = lhs_canon; + exprs[1] = rhs_canon; } else { - exprs[0] = rhs; - exprs[1] = lhs; + exprs[0] = rhs_canon; + exprs[1] = lhs_canon; } - return intersect_index_spaces(exprs); + IndexSpaceExpression *result = intersect_index_spaces(exprs); + // Add the live reference + if (mutator == NULL) + { + LocalReferenceMutator local_mutator; + result->add_base_expression_reference(LIVE_EXPR_REF, 
&local_mutator); + } + else + result->add_base_expression_reference(LIVE_EXPR_REF, mutator); + // Save it in the implicit live expression references + if (implicit_live_expressions == NULL) + implicit_live_expressions = new std::vector(); + implicit_live_expressions->emplace_back(result); + // Remove the gc reference that comes back with the trie traversal + if (result->remove_live_reference(REGION_TREE_REF)) + assert(false); // should never hit this + return result; } //-------------------------------------------------------------------------- IndexSpaceExpression* RegionTreeForest::intersect_index_spaces( - const std::set &exprs) + const std::set &exprs, + ReferenceMutator *mutator) //-------------------------------------------------------------------------- { #ifdef DEBUG_LEGION assert(!exprs.empty()); + for (std::set::const_iterator it = + exprs.begin(); it != exprs.end(); it++) + assert((*it)->is_valid()); #endif if (exprs.size() == 1) return *(exprs.begin()); @@ -5844,6 +5997,7 @@ namespace Legion { // remove duplicates std::vector::iterator last = std::unique(expressions.begin(), expressions.end()); + LocalReferenceMutator local_mutator; if (last != expressions.end()) { expressions.erase(last, expressions.end()); @@ -5851,8 +6005,24 @@ namespace Legion { assert(!expressions.empty()); #endif if (expressions.size() == 1) - return expressions.back(); + { + IndexSpaceExpression *result = expressions.back(); + if (exprs.find(result) == exprs.end()) + { + if (mutator == NULL) + result->add_base_expression_reference(LIVE_EXPR_REF, + &local_mutator); + else + result->add_base_expression_reference(LIVE_EXPR_REF, mutator); + if (implicit_live_expressions == NULL) + implicit_live_expressions = + new std::vector; + implicit_live_expressions->emplace_back(result); + } + return result; + } } + bool first_pass = true; // this helps make sure we don't overflow our stack while (expressions.size() > MAX_EXPRESSION_FANOUT) { @@ -5871,37 +6041,138 @@ namespace Legion { if 
(expressions.empty()) break; } - next_expressions.push_back( - intersect_index_spaces(temp_expressions)); + IndexSpaceExpression *expr = + intersect_index_spaces(temp_expressions); + if (mutator == NULL) + expr->add_base_expression_reference(REGION_TREE_REF, + &local_mutator); + else + expr->add_base_expression_reference(REGION_TREE_REF, mutator); + // Remove the gc ref that comes back from the union call + if (expr->remove_live_reference(REGION_TREE_REF)) + assert(false); // should never hit this + next_expressions.push_back(expr); } else { - next_expressions.push_back(expressions.back()); + IndexSpaceExpression *expr = expressions.back(); expressions.pop_back(); + if (mutator == NULL) + expr->add_base_expression_reference(REGION_TREE_REF, + &local_mutator); + else + expr->add_base_expression_reference(REGION_TREE_REF, mutator); + next_expressions.push_back(expr); } } + if (!first_pass) + { + // Remove the expression references on the previous set + for (std::vector::const_iterator it = + expressions.begin(); it != expressions.end(); it++) + if ((*it)->remove_base_expression_reference(REGION_TREE_REF)) + delete (*it); + } + else + first_pass = false; expressions.swap(next_expressions); // canonicalize and uniquify them all again + std::set unique_expressions; for (unsigned idx = 0; idx < expressions.size(); idx++) { - IndexSpaceExpression *&expr = expressions[idx]; - if (expr->is_empty()) - return expr; - expr = expr->get_canonical_expression(this); + IndexSpaceExpression *expr = expressions[idx]; + IndexSpaceExpression *unique = expr->get_canonical_expression(this); + if (unique->is_empty()) + { + // Add a reference to the unique expression + if (exprs.find(unique) == exprs.end()) + { + if (mutator == NULL) + unique->add_base_expression_reference(LIVE_EXPR_REF, + &local_mutator); + else + unique->add_base_expression_reference(LIVE_EXPR_REF, mutator); + if (implicit_live_expressions == NULL) + implicit_live_expressions = + new std::vector; + 
implicit_live_expressions->emplace_back(unique); + } + // Remove references on all the things we no longer need + for (std::set:: + const_iterator it = unique_expressions.begin(); it != + unique_expressions.end(); it++) + if ((*it)->remove_base_expression_reference(REGION_TREE_REF)) + delete (*it); + for (std::vector::const_iterator it = + expressions.begin(); it != expressions.end(); it++) + if ((*it)->remove_base_expression_reference(REGION_TREE_REF)) + delete (*it); + return unique; + } + if (unique_expressions.insert(unique).second) + { + if (mutator == NULL) + unique->add_base_expression_reference(REGION_TREE_REF, + &local_mutator); + else + unique->add_base_expression_reference(REGION_TREE_REF, mutator); + } } - std::sort(expressions.begin(), expressions.end(), compare_expressions); - last = std::unique(expressions.begin(), expressions.end()); - if (last != expressions.end()) + // Remove the expression references + for (std::vector::const_iterator it = + expressions.begin(); it != expressions.end(); it++) + if ((*it)->remove_base_expression_reference(REGION_TREE_REF)) + delete (*it); + if (unique_expressions.size() == 1) { - expressions.erase(last, expressions.end()); -#ifdef DEBUG_LEGION - assert(!expressions.empty()); -#endif - if (expressions.size() == 1) - return expressions.back(); + IndexSpaceExpression *result = *(unique_expressions.begin()); + if (exprs.find(result) == exprs.end()) + { + if (mutator == NULL) + result->add_base_expression_reference(LIVE_EXPR_REF, + &local_mutator); + else + result->add_base_expression_reference(LIVE_EXPR_REF, mutator); + if (implicit_live_expressions == NULL) + implicit_live_expressions = + new std::vector(); + implicit_live_expressions->emplace_back(result); + } + // Remove the extra expression reference we added + if (result->remove_base_expression_reference(REGION_TREE_REF)) + assert(false); // should never hit this + return result; } + expressions.resize(unique_expressions.size()); + unsigned index = 0; + for 
(std::set::const_iterator + it = unique_expressions.begin(); + it != unique_expressions.end(); it++) + expressions[index++] = *it; + } + IndexSpaceExpression *result = intersect_index_spaces(expressions); + if (exprs.find(result) == exprs.end()) + { + if (mutator == NULL) + result->add_base_expression_reference(LIVE_EXPR_REF,&local_mutator); + else + result->add_base_expression_reference(LIVE_EXPR_REF, mutator); + if (implicit_live_expressions == NULL) + implicit_live_expressions = new std::vector; + implicit_live_expressions->emplace_back(result); + } + // Remove the reference added by the trie traversal + if (result->remove_live_reference(REGION_TREE_REF)) + assert(false); // should never hit this deletion + if (!first_pass) + { + // Remove the extra references on the expression vector we added + for (std::vector::const_iterator it = + expressions.begin(); it != expressions.end(); it++) + if ((*it)->remove_base_expression_reference(REGION_TREE_REF)) + delete (*it); } - return intersect_index_spaces(expressions); + return result; } //-------------------------------------------------------------------------- @@ -5925,7 +6196,8 @@ namespace Legion { { IndexSpaceExpression *result = NULL; ExpressionTrieNode *next = NULL; - if (finder->second->find_operation(expressions, result, next)) + if (finder->second->find_operation(expressions, result, next) && + result->try_add_live_reference(REGION_TREE_REF)) return result; if (creator == NULL) { @@ -5986,11 +6258,14 @@ namespace Legion { //-------------------------------------------------------------------------- IndexSpaceExpression* RegionTreeForest::subtract_index_spaces( IndexSpaceExpression *lhs, IndexSpaceExpression *rhs, - OperationCreator *creator/*=NULL*/) + OperationCreator *creator/*=NULL*/, + ReferenceMutator *mutator/*=NULL*/) //-------------------------------------------------------------------------- { #ifdef DEBUG_LEGION assert(lhs->type_tag == rhs->type_tag); + assert(lhs->is_valid()); + 
assert(rhs->is_valid()); #endif // Handle a few easy cases if (creator == NULL) @@ -6005,71 +6280,93 @@ namespace Legion { expressions[1] = rhs->get_canonical_expression(this); const IndexSpaceExprID key = expressions[0]->expr_id; // See if we can find it in read-only mode + IndexSpaceExpression *result = NULL; { AutoLock l_lock(lookup_is_op_lock,1,false/*exclusive*/); std::map::const_iterator finder = difference_ops.find(key); if (finder != difference_ops.end()) { - IndexSpaceExpression *result = NULL; + IndexSpaceExpression *expr = NULL; ExpressionTrieNode *next = NULL; - if (finder->second->find_operation(expressions, result, next)) - return result; - if (creator == NULL) + if (finder->second->find_operation(expressions, expr, next) && + expr->try_add_live_reference(REGION_TREE_REF)) + result = expr; + if (result == NULL) { - DifferenceOpCreator diff_creator(this, lhs->type_tag, - expressions[0], expressions[1]); - return next->find_or_create_operation(expressions, diff_creator); + if (creator == NULL) + { + DifferenceOpCreator diff_creator(this, lhs->type_tag, + expressions[0], expressions[1]); + result = next->find_or_create_operation(expressions,diff_creator); + } + else + result = next->find_or_create_operation(expressions, *creator); } - else - return next->find_or_create_operation(expressions, *creator); } } - ExpressionTrieNode *node = NULL; - if (creator == NULL) + if (result == NULL) { - DifferenceOpCreator diff_creator(this, lhs->type_tag, - expressions[0], expressions[1]); - // Didn't find it, retake the lock, see if we lost the race - // and if not make the actual trie node - AutoLock l_lock(lookup_is_op_lock); - // See if we lost the race - std::map::const_iterator - finder = difference_ops.find(key); - if (finder == difference_ops.end()) + ExpressionTrieNode *node = NULL; + if (creator == NULL) { - // Didn't lose the race so make the node - node = new ExpressionTrieNode(0/*depth*/, expressions[0]->expr_id); - difference_ops[key] = node; - } - else 
- node = finder->second; + DifferenceOpCreator diff_creator(this, lhs->type_tag, + expressions[0], expressions[1]); + // Didn't find it, retake the lock, see if we lost the race + // and if not make the actual trie node + AutoLock l_lock(lookup_is_op_lock); + // See if we lost the race + std::map::const_iterator + finder = difference_ops.find(key); + if (finder == difference_ops.end()) + { + // Didn't lose the race so make the node + node = new ExpressionTrieNode(0/*depth*/, expressions[0]->expr_id); + difference_ops[key] = node; + } + else + node = finder->second; #ifdef DEBUG_LEGION - assert(node != NULL); + assert(node != NULL); #endif - return node->find_or_create_operation(expressions, diff_creator); - } - else - { - // Didn't find it, retake the lock, see if we lost the race - // and if not make the actual trie node - AutoLock l_lock(lookup_is_op_lock); - // See if we lost the race - std::map::const_iterator - finder = difference_ops.find(key); - if (finder == difference_ops.end()) - { - // Didn't lose the race so make the node - node = new ExpressionTrieNode(0/*depth*/, expressions[0]->expr_id); - difference_ops[key] = node; + result = node->find_or_create_operation(expressions, diff_creator); } else - node = finder->second; + { + // Didn't find it, retake the lock, see if we lost the race + // and if not make the actual trie node + AutoLock l_lock(lookup_is_op_lock); + // See if we lost the race + std::map::const_iterator + finder = difference_ops.find(key); + if (finder == difference_ops.end()) + { + // Didn't lose the race so make the node + node = new ExpressionTrieNode(0/*depth*/, expressions[0]->expr_id); + difference_ops[key] = node; + } + else + node = finder->second; #ifdef DEBUG_LEGION - assert(node != NULL); + assert(node != NULL); #endif - return node->find_or_create_operation(expressions, *creator); + result = node->find_or_create_operation(expressions, *creator); + } } + if (mutator == NULL) + { + LocalReferenceMutator local_mutator; + 
result->add_base_expression_reference(LIVE_EXPR_REF, &local_mutator); + } + else + result->add_base_expression_reference(LIVE_EXPR_REF, mutator); + if (implicit_live_expressions == NULL) + implicit_live_expressions = new std::vector; + implicit_live_expressions->emplace_back(result); + // Remove the gc reference that comes back from finding it in the tree + if (result->remove_live_reference(REGION_TREE_REF)) + assert(false); // should never hit this + return result; } //-------------------------------------------------------------------------- @@ -6108,37 +6405,27 @@ namespace Legion { } //-------------------------------------------------------------------------- - void RegionTreeForest::invalidate_index_space_expression( - const std::vector &parents) + void RegionTreeForest::invalidate_index_space_operations( + const std::vector &derived) //-------------------------------------------------------------------------- { // Two phases here: in read-only made figure out the set of operations // we are going to invalidate but don't remove them yet - std::deque to_remove; - { - AutoLock l_lock(lookup_is_op_lock,1,false/*exclusive*/); - for (std::vector::const_iterator it = - parents.begin(); it != parents.end(); it++) - (*it)->invalidate_operation(to_remove); - } - if (to_remove.empty()) - return; - // Now retake the lock and do the removal - std::deque to_delete; + std::vector invalidated; + invalidated.reserve(derived.size()); { AutoLock l_lock(lookup_is_op_lock); - for (std::deque::const_iterator it = - to_remove.begin(); it != to_remove.end(); it++) + for (std::vector::const_iterator it = + derived.begin(); it != derived.end(); it++) { - if ((*it)->remove_operation(this)) - to_delete.push_back(*it); + if ((*it)->invalidate_operation()) + invalidated.push_back(*it); } } - if (to_delete.empty()) - return; - for (std::deque::const_iterator it = - to_delete.begin(); it != to_delete.end(); it++) - delete (*it); + for (std::vector::const_iterator it = + invalidated.begin(); 
it != invalidated.end(); it++) + if ((*it)->remove_base_gc_ref(REGION_TREE_REF)) + delete (*it); } //-------------------------------------------------------------------------- @@ -6375,8 +6662,14 @@ namespace Legion { result->send_remote_gc_decrement(source); return result; } - RemoteExpressionCreator creator(this, derez); - return creator.consume(); + TypeTag type_tag; + derez.deserialize(type_tag); + RemoteExpressionCreator creator(this, type_tag, derez); + NT_TemplateHelper::demux(type_tag, &creator); +#ifdef DEBUG_LEGION + assert(creator.operation != NULL); +#endif + return creator.operation; } ///////////////////////////////////////////////////////////// @@ -6415,7 +6708,7 @@ namespace Legion { //-------------------------------------------------------------------------- { #ifdef DEBUG_LEGION - assert(parent_operations.empty()); + assert(derived_operations.empty()); #endif } @@ -6426,7 +6719,7 @@ namespace Legion { { const TightenIndexSpaceArgs *targs = (const TightenIndexSpaceArgs*)args; targs->proxy_this->tighten_index_space(); - if (targs->proxy_this->remove_base_expression_reference(IS_EXPR_REF)) + if (targs->proxy_dc->remove_base_resource_ref(META_TASK_REF)) delete targs->proxy_this; } @@ -6439,25 +6732,56 @@ namespace Legion { } //-------------------------------------------------------------------------- - void IndexSpaceExpression::add_parent_operation(IndexSpaceOperation *op) + void IndexSpaceExpression::add_derived_operation(IndexSpaceOperation *op) //-------------------------------------------------------------------------- { AutoLock e_lock(expr_lock); #ifdef DEBUG_LEGION - assert(parent_operations.find(op) == parent_operations.end()); + assert(derived_operations.find(op) == derived_operations.end()); #endif - parent_operations.insert(op); + derived_operations.insert(op); } //-------------------------------------------------------------------------- - void IndexSpaceExpression::remove_parent_operation(IndexSpaceOperation *op) + void 
IndexSpaceExpression::remove_derived_operation(IndexSpaceOperation *op) //-------------------------------------------------------------------------- { AutoLock e_lock(expr_lock); #ifdef DEBUG_LEGION - assert(parent_operations.find(op) != parent_operations.end()); + assert(derived_operations.find(op) != derived_operations.end()); #endif - parent_operations.erase(op); + derived_operations.erase(op); + } + + //-------------------------------------------------------------------------- + void IndexSpaceExpression::invalidate_derived_operations(DistributedID did, + RegionTreeForest *context) + //-------------------------------------------------------------------------- + { + // Traverse upwards for any derived operations and invalidate them + std::vector derived; + { + AutoLock e_lock(expr_lock,1,false/*exclusive*/); + if (!derived_operations.empty()) + { + derived.reserve(derived_operations.size()); + for (std::set::const_iterator it = + derived_operations.begin(); it != derived_operations.end(); it++) + { + (*it)->add_tree_expression_reference(did); + derived.push_back(*it); + } + } + } + if (!derived.empty()) + { + context->invalidate_index_space_operations(derived); + // Remove any references that we have on the parents + for (std::vector::const_iterator it = + derived.begin(); it != derived.end(); it++) + if ((*it)->remove_tree_expression_reference(did)) + delete (*it); + } } //-------------------------------------------------------------------------- @@ -6479,8 +6803,11 @@ namespace Legion { // If the canonical expression is not ourself, then the region tree // forest has given us a reference back on it, see if we're the first // ones to write it, if not we can remove the reference now + const DistributedID did = get_distributed_id(); if (!__sync_bool_compare_and_swap(&canonical, NULL, expr)) - expr->remove_canonical_reference(get_distributed_id()); + expr->remove_canonical_reference(did); + else // We're the first so add our resource reference + 
expr->add_tree_expression_reference(did); return expr; } @@ -6545,23 +6872,6 @@ namespace Legion { origin, &wait_for); } - //-------------------------------------------------------------------------- - /*static*/ void IndexSpaceExpression::finalize_canonical(size_t volume, - RegionTreeForest *forest, - IndexSpaceExpression *original, - IndexSpaceExpression *canonical) - //-------------------------------------------------------------------------- - { -#ifdef DEBUG_LEGION - assert(canonical != NULL); -#endif - if (canonical == original) - forest->remove_canonical_expression(canonical, volume); - else if (canonical->remove_canonical_reference( - original->get_distributed_id())) - delete canonical; - } - ///////////////////////////////////////////////////////////// // Index Space Operation ///////////////////////////////////////////////////////////// @@ -6574,10 +6884,11 @@ namespace Legion { ctx->runtime->get_available_distributed_id(), ctx->runtime->address_space), context(ctx), origin_expr(this), op_kind(kind), invalidated(0) +#ifdef DEBUG_LEGION + , tree_active(true) +#endif //-------------------------------------------------------------------------- { - // We always keep a reference on ourself until we get invalidated - add_base_resource_ref(IS_EXPR_REF); #ifdef LEGION_GC log_garbage.info("GC Index Expr %lld %d %lld", LEGION_DISTRIBUTED_ID_FILTER(this->did), local_space, expr_id); @@ -6592,11 +6903,17 @@ namespace Legion { DistributedCollectable(ctx->runtime, did, owner), context(ctx), origin_expr(origin), op_kind(REMOTE_EXPRESSION_KIND), invalidated(0) +#ifdef DEBUG_LEGION + , tree_active(true) +#endif //-------------------------------------------------------------------------- { #ifdef DEBUG_LEGION assert(!is_owner()); #endif + // Keep a gc reference to ensure that this remain active until it + // is made invalid on the owner node + add_base_gc_ref(REMOTE_DID_REF); #ifdef LEGION_GC log_garbage.info("GC Index Expr %lld %d %lld", 
LEGION_DISTRIBUTED_ID_FILTER(this->did), local_space, expr_id); @@ -6607,47 +6924,121 @@ namespace Legion { IndexSpaceOperation::~IndexSpaceOperation(void) //-------------------------------------------------------------------------- { - if (canonical != NULL) - { + } + + //-------------------------------------------------------------------------- + void IndexSpaceOperation::notify_active(ReferenceMutator *mutator) + //-------------------------------------------------------------------------- + { #ifdef DEBUG_LEGION - assert(has_volume); + assert(tree_active); // should only happen once #endif - IndexSpaceExpression::finalize_canonical(volume,context,this,canonical); - } + } + + //-------------------------------------------------------------------------- + void IndexSpaceOperation::notify_valid(ReferenceMutator *mutator) + //-------------------------------------------------------------------------- + { + // If we're not the owner send a valid reference to the owner if (!is_owner()) - context->unregister_remote_expression(expr_id); + send_remote_valid_increment(owner_space, mutator); + if ((canonical != NULL) && (canonical != this) && + !canonical->try_add_canonical_reference(did)) + { + // We were unsuccessful at adding our canonical reference so + // remove the resource reference to the canonical object and + // and mark that we no longer have a canonical expression + if (canonical->remove_tree_expression_reference(did)) + delete canonical; + canonical = NULL; + } } //-------------------------------------------------------------------------- - void IndexSpaceOperation::notify_active(ReferenceMutator *mutator) + void IndexSpaceOperation::notify_invalid(ReferenceMutator *mutator) //-------------------------------------------------------------------------- { - // If we're not the owner send a gc reference to the owner if (!is_owner()) - send_remote_gc_increment(owner_space, mutator); + send_remote_valid_decrement(owner_space, mutator); + // If we have a canonical 
reference that is not ourselves then + // we need to remove the nested reference that we are holding on it too + if ((canonical != NULL) && (canonical != this) && + canonical->remove_canonical_reference(did)) + // should never actually delete it since we have a resource ref too + assert(false); + } + + //-------------------------------------------------------------------------- + void IndexSpaceOperation::InactiveFunctor::apply(AddressSpaceID target) + //-------------------------------------------------------------------------- + { + op->send_remote_gc_decrement(target, mutator); } //-------------------------------------------------------------------------- void IndexSpaceOperation::notify_inactive(ReferenceMutator *mutator) //-------------------------------------------------------------------------- { - // Remove the remote gc reference on the owner - if (!is_owner()) - send_remote_gc_decrement(owner_space, mutator); +#ifdef DEBUG_LEGION + // Should only go through one cycle of active to not active + assert(tree_active); + tree_active = false; +#endif + if (is_owner()) + { + // Send the removal of the gc references on any remote nodes + if (has_remote_instances()) + { + InactiveFunctor functor(this, mutator); + map_over_remote_instances(functor); + } + } + else + context->unregister_remote_expression(expr_id); + // Invalidate any derived operations + invalidate_derived_operations(did, context); + // Remove this operation from the region tree + remove_operation(); + if (canonical != NULL) + { + if (canonical == this) + { +#ifdef DEBUG_LEGION + assert(has_volume); +#endif + context->remove_canonical_expression(this, volume); + } + else if (canonical->remove_tree_expression_reference(did)) + delete canonical; + } } //-------------------------------------------------------------------------- bool IndexSpaceOperation::try_add_canonical_reference(DistributedID source) //-------------------------------------------------------------------------- { - return 
check_resource_and_increment(source); + return check_gc_and_increment(source); } //-------------------------------------------------------------------------- bool IndexSpaceOperation::remove_canonical_reference(DistributedID source) //-------------------------------------------------------------------------- { - return remove_nested_resource_ref(source); + return remove_nested_gc_ref(source); + } + + //-------------------------------------------------------------------------- + bool IndexSpaceOperation::try_add_live_reference(ReferenceSource source) + //-------------------------------------------------------------------------- + { + return check_gc_and_increment(source); + } + + //-------------------------------------------------------------------------- + bool IndexSpaceOperation::remove_live_reference(ReferenceSource source) + //-------------------------------------------------------------------------- + { + return remove_base_gc_ref(source); } //-------------------------------------------------------------------------- @@ -6658,10 +7049,10 @@ namespace Legion { if (mutator == NULL) { LocalReferenceMutator local_mutator; - add_base_gc_ref(source, &local_mutator, count); + add_base_valid_ref(source, &local_mutator, count); } else - add_base_gc_ref(source, mutator, count); + add_base_valid_ref(source, mutator, count); } //-------------------------------------------------------------------------- @@ -6681,10 +7072,10 @@ namespace Legion { if (mutator == NULL) { LocalReferenceMutator local_mutator; - add_nested_gc_ref(source, &local_mutator, count); + add_nested_valid_ref(source, &local_mutator, count); } else - add_nested_gc_ref(source, mutator, count); + add_nested_valid_ref(source, mutator, count); } //-------------------------------------------------------------------------- @@ -6692,7 +7083,7 @@ namespace Legion { ReferenceSource source, unsigned count) //-------------------------------------------------------------------------- { - return 
remove_base_gc_ref(source, NULL/*mutator*/, count); + return remove_base_valid_ref(source, NULL/*mutator*/, count); } //-------------------------------------------------------------------------- @@ -6700,7 +7091,7 @@ namespace Legion { DistributedID source, unsigned count) //-------------------------------------------------------------------------- { - return remove_nested_gc_ref(source, NULL/*mutator*/, count); + return remove_nested_valid_ref(source, NULL/*mutator*/, count); } //-------------------------------------------------------------------------- @@ -6719,57 +7110,13 @@ namespace Legion { return remove_nested_resource_ref(id, count); } - //-------------------------------------------------------------------------- - void IndexSpaceOperation::invalidate_operation( - std::deque &to_remove) - //-------------------------------------------------------------------------- - { - // See if we're the first one here, there can be a race with - // multiple invalidations occurring at the same time - if (__sync_fetch_and_add(&invalidated, 1) > 0) - return; - // Add ourselves to the list if we're here first - to_remove.push_back(this); - // The expression that we added in the constructor flows back in - // the 'to_remove' data structure - std::vector parents; - { - // Have to get a read-only copy of these while holding the lock - AutoLock i_lock(inter_lock,1,false/*exclusive*/); - // If we don't have any parent operations then we're done - if (parent_operations.empty()) - return; - parents.resize(parent_operations.size()); - unsigned idx = 0; - for (std::set::const_iterator it = - parent_operations.begin(); it != - parent_operations.end(); it++, idx++) - { - // Add a reference to prevent the parents from being collected - // as we're traversing up the tree - (*it)->add_tree_expression_reference(did); - parents[idx] = (*it); - } - } - // Now continue up the tree with the parents which we are temporarily - // holding a reference to in order to prevent a collection race - for 
(std::vector::const_iterator it = - parents.begin(); it != parents.end(); it++) - { - (*it)->invalidate_operation(to_remove); - // Remove the reference when we're done with the parents - if ((*it)->remove_tree_expression_reference(did)) - delete (*it); - } - } - ///////////////////////////////////////////////////////////// // Operation Creator ///////////////////////////////////////////////////////////// //-------------------------------------------------------------------------- - OperationCreator::OperationCreator(void) - : result(NULL) + OperationCreator::OperationCreator(RegionTreeForest *f) + : forest(f), result(NULL) //-------------------------------------------------------------------------- { } @@ -6779,11 +7126,8 @@ namespace Legion { //-------------------------------------------------------------------------- { // If we still have a result then it's because it wasn't consumed need - // we need to remove it's reference that was added by the - // IndexSpaceOperation constructor - // We know the operation was never added to the region tree so we - // can pass in a NULL pointer to the region tree forest - if ((result != NULL) && result->remove_operation(NULL/*forest*/)) + // we need to remove it's reference that was added by the constructor + if ((result != NULL) && result->remove_base_resource_ref(REGION_TREE_REF)) delete result; } @@ -6806,9 +7150,11 @@ namespace Legion { #ifdef DEBUG_LEGION assert(result != NULL); #endif - IndexSpaceExpression *temp = result; - result = NULL; - return temp; + // Add an expression reference here since this is going to be put + // into the region tree expression trie data structure, the reference + // will be removed when the expressions is removed from the trie + result->add_base_gc_ref(REGION_TREE_REF); + return result; } ///////////////////////////////////////////////////////////// @@ -6981,13 +7327,17 @@ namespace Legion { { // We're the node that should have the operation // Check to see if we've made the operation yet 
- if (local_operation != NULL) + if ((local_operation != NULL) && + local_operation->try_add_live_reference(REGION_TREE_REF)) return local_operation; // Operation doesn't exist yet, retake the lock and try to make it AutoLock t_lock(trie_lock); - if (local_operation != NULL) + if ((local_operation != NULL) && + local_operation->try_add_live_reference(REGION_TREE_REF)) return local_operation; local_operation = creator.consume(); + if (!local_operation->try_add_live_reference(REGION_TREE_REF)) + assert(false); // should never hit this return local_operation; } else if (expressions.size() == (depth+2)) @@ -7001,7 +7351,8 @@ namespace Legion { AutoLock t_lock(trie_lock,1,false/*exclusive*/); std::map::const_iterator op_finder = operations.find(target_expr); - if (op_finder != operations.end()) + if ((op_finder != operations.end()) && + op_finder->second->try_add_live_reference(REGION_TREE_REF)) return op_finder->second; std::map::const_iterator node_finder = nodes.find(target_expr); @@ -7015,7 +7366,8 @@ namespace Legion { AutoLock t_lock(trie_lock); std::map::const_iterator op_finder = operations.find(target_expr); - if (op_finder != operations.end()) + if ((op_finder != operations.end()) && + op_finder->second->try_add_live_reference(REGION_TREE_REF)) return op_finder->second; // Still don't have the op std::map::const_iterator @@ -7025,6 +7377,8 @@ namespace Legion { // Didn't find the sub-node, so make the operation here IndexSpaceExpression *result = creator.consume(); operations[target_expr] = result; + if (!result->try_add_live_reference(REGION_TREE_REF)) + assert(false); // should never hit this return result; } else @@ -7383,6 +7737,9 @@ namespace Legion { realm_index_space_set(Runtime::create_rt_user_event()), tight_index_space_set(Runtime::create_rt_user_event()), tight_index_space(false), tree_valid(is_owner()) +#ifdef DEBUG_LEGION + , tree_active(true) +#endif //-------------------------------------------------------------------------- { #ifdef 
DEBUG_LEGION @@ -7417,13 +7774,6 @@ namespace Legion { IndexSpaceNode::~IndexSpaceNode(void) //-------------------------------------------------------------------------- { - if (canonical != NULL) - { -#ifdef DEBUG_LEGION - assert(has_volume); -#endif - IndexSpaceExpression::finalize_canonical(volume,context,this,canonical); - } // Remove ourselves from the context if (registered_with_runtime) context->remove_node(handle); @@ -7444,19 +7794,56 @@ namespace Legion { } //-------------------------------------------------------------------------- - void IndexSpaceNode::notify_valid(ReferenceMutator *mutator) + void IndexSpaceNode::notify_active(ReferenceMutator *mutator) //-------------------------------------------------------------------------- { #ifdef DEBUG_LEGION - assert(tree_valid || !is_owner()); + assert(tree_active); #endif + } + + //-------------------------------------------------------------------------- + void IndexSpaceNode::notify_valid(ReferenceMutator *mutator) + //-------------------------------------------------------------------------- + { // If we're not the owner, we add a valid reference to the owner if (!is_owner()) send_remote_valid_increment(owner_space, mutator); + if ((canonical != NULL) && (canonical != this) && + !canonical->try_add_canonical_reference(did)) + { + // We were unsuccessful at adding our canonical reference so + // remove the resource reference to the canonical object and + // and mark that we no longer have a canonical expression + if (canonical->remove_tree_expression_reference(did)) + delete canonical; + canonical = NULL; + } + } + + //-------------------------------------------------------------------------- + void IndexSpaceNode::notify_invalid(ReferenceMutator *mutator) + //-------------------------------------------------------------------------- + { + if (is_owner()) + { + AutoLock n_lock(node_lock); + // First time we become invalid then the tree is no longer valid + // Any later valid states are just for expression 
references + tree_valid = false; + } + else + send_remote_valid_decrement(owner_space, mutator); + // If we have a canonical reference that is not ourselves then + // we need to remove the nested reference that we are holding on it too + if ((canonical != NULL) && (canonical != this) && + canonical->remove_canonical_reference(did)) + // should never actually delete it since we have a resource ref too + assert(false); } //-------------------------------------------------------------------------- - void IndexSpaceNode::InvalidFunctor::apply(AddressSpaceID target) + void IndexSpaceNode::InactiveFunctor::apply(AddressSpaceID target) //-------------------------------------------------------------------------- { std::map::iterator finder = @@ -7469,57 +7856,38 @@ namespace Legion { } //-------------------------------------------------------------------------- - void IndexSpaceNode::notify_invalid(ReferenceMutator *mutator) + void IndexSpaceNode::notify_inactive(ReferenceMutator *mutator) //-------------------------------------------------------------------------- { - if (is_owner()) - { - AutoLock n_lock(node_lock); #ifdef DEBUG_LEGION - assert(tree_valid); + // Should only become active one time + assert(tree_active); + tree_active = false; #endif - tree_valid = false; - // Send the invalidations - if ((send_references == 0) && has_remote_instances()) + if (is_owner()) + { + // Send the removal of the gc references on any remote nodes + if (has_remote_instances()) { - // Make sure invalidation are not handled before send effects - InvalidFunctor functor(this, mutator, send_effects); + InactiveFunctor functor(this, mutator, send_effects); map_over_remote_instances(functor); } } else - send_remote_valid_decrement(owner_space, mutator); - } - - //-------------------------------------------------------------------------- - void IndexSpaceNode::notify_inactive(ReferenceMutator *mutator) - //-------------------------------------------------------------------------- - { - // 
Traverse upwards for any parent operations and invalidate them - std::vector parents; + context->unregister_remote_expression(expr_id); + // Invalidate any derived operations + invalidate_derived_operations(did, context); + if (canonical != NULL) { - AutoLock n_lock(node_lock,1,false/*exclusive*/); - if (!parent_operations.empty()) + if (canonical == this) { - parents.resize(parent_operations.size()); - unsigned idx = 0; - for (std::set::const_iterator it = - parent_operations.begin(); it != - parent_operations.end(); it++, idx++) - { - (*it)->add_tree_expression_reference(did); - parents[idx] = (*it); - } +#ifdef DEBUG_LEGION + assert(has_volume); +#endif + context->remove_canonical_expression(this, volume); } - } - if (!parents.empty()) - { - context->invalidate_index_space_expression(parents); - // Remove any references that we have on the parents - for (std::vector::const_iterator it = - parents.begin(); it != parents.end(); it++) - if ((*it)->remove_tree_expression_reference(did)) - delete (*it); + else if (canonical->remove_tree_expression_reference(did)) + delete canonical; } } @@ -8216,13 +8584,6 @@ namespace Legion { // Remove this from the remote instances since we did // not actually end up sending it filter_remote_instances(target); - // Send the invalidations - if (remove_reference && !tree_valid && has_remote_instances()) - { - // Make sure invalidation are not handled before send effects - InvalidFunctor functor(this, &mutator, send_effects); - map_over_remote_instances(functor); - } } if (remove_reference && parent->remove_nested_resource_ref(did)) delete parent; @@ -8285,13 +8646,6 @@ namespace Legion { assert(send_references > 0); #endif remove_reference = (--send_references == 0); - // Send the invalidations - if (remove_reference && !tree_valid && has_remote_instances()) - { - // Make sure invalidation are not handled before send effects - InvalidFunctor functor(this, &mutator, send_effects); - map_over_remote_instances(functor); - } } } if 
(remove_reference && parent->remove_nested_resource_ref(did)) @@ -8307,14 +8661,6 @@ namespace Legion { { AutoLock n_lock(node_lock); remove_reference = (--send_references == 0); - // Send the invalidations - if (is_owner() && remove_reference && - !tree_valid && has_remote_instances()) - { - // Make sure invalidation are not handled before send effects - InvalidFunctor functor(this, &mutator, send_effects); - map_over_remote_instances(functor); - } } if (remove_reference && parent->remove_nested_resource_ref(did)) delete parent; @@ -8683,14 +9029,28 @@ namespace Legion { bool IndexSpaceNode::try_add_canonical_reference(DistributedID source) //-------------------------------------------------------------------------- { - return check_resource_and_increment(source); + return check_gc_and_increment(source); } //-------------------------------------------------------------------------- bool IndexSpaceNode::remove_canonical_reference(DistributedID source) //-------------------------------------------------------------------------- { - return remove_nested_resource_ref(source); + return remove_nested_gc_ref(source); + } + + //-------------------------------------------------------------------------- + bool IndexSpaceNode::try_add_live_reference(ReferenceSource source) + //-------------------------------------------------------------------------- + { + return check_gc_and_increment(source); + } + + //-------------------------------------------------------------------------- + bool IndexSpaceNode::remove_live_reference(ReferenceSource source) + //-------------------------------------------------------------------------- + { + return remove_base_gc_ref(source); } //-------------------------------------------------------------------------- @@ -8701,10 +9061,10 @@ namespace Legion { if (mutator == NULL) { LocalReferenceMutator local_mutator; - add_base_gc_ref(source, &local_mutator, count); + add_base_valid_ref(source, &local_mutator, count); } else - add_base_gc_ref(source, 
mutator, count); + add_base_valid_ref(source, mutator, count); } //-------------------------------------------------------------------------- @@ -8724,10 +9084,10 @@ namespace Legion { if (mutator == NULL) { LocalReferenceMutator local_mutator; - add_nested_gc_ref(source, &local_mutator, count); + add_nested_valid_ref(source, &local_mutator, count); } else - add_nested_gc_ref(source, mutator, count); + add_nested_valid_ref(source, mutator, count); } //-------------------------------------------------------------------------- @@ -8735,7 +9095,7 @@ namespace Legion { ReferenceSource source, unsigned count) //-------------------------------------------------------------------------- { - return remove_base_gc_ref(source, NULL/*mutator*/, count); + return remove_base_valid_ref(source, NULL/*mutator*/, count); } //-------------------------------------------------------------------------- @@ -8743,7 +9103,7 @@ namespace Legion { DistributedID source, unsigned count) //-------------------------------------------------------------------------- { - return remove_nested_gc_ref(source, NULL/*mutator*/, count); + return remove_nested_valid_ref(source, NULL/*mutator*/, count); } //-------------------------------------------------------------------------- @@ -8762,13 +9122,6 @@ namespace Legion { return remove_nested_resource_ref(id, count); } - //-------------------------------------------------------------------------- - bool IndexSpaceNode::remove_operation(RegionTreeForest *forest) - //-------------------------------------------------------------------------- - { - return remove_base_resource_ref(IS_EXPR_REF); - } - //-------------------------------------------------------------------------- bool IndexSpaceNode::intersects_with(IndexSpaceNode *rhs, bool compute) //-------------------------------------------------------------------------- @@ -8946,7 +9299,10 @@ namespace Legion { parent->add_nested_resource_ref(did); color_space->add_nested_resource_ref(did); if (has_complete 
&& complete) + { + parent->add_nested_expression_reference(did); union_expr.store(parent); + } else union_expr.store(NULL); #ifdef DEBUG_LEGION @@ -8981,7 +9337,10 @@ namespace Legion { parent->add_nested_resource_ref(did); color_space->add_nested_resource_ref(did); if (has_complete && complete) + { + parent->add_nested_expression_reference(did); union_expr.store(parent); + } else union_expr.store(NULL); #ifdef DEBUG_LEGION @@ -9147,6 +9506,10 @@ namespace Legion { delete (*it); partition_trackers.clear(); } + // Remove the reference on our union expression if we have one + IndexSpaceExpression *expr = union_expr.load(); + if ((expr != NULL) && expr->remove_nested_expression_reference(did)) + delete expr; } //-------------------------------------------------------------------------- @@ -9847,10 +10210,15 @@ namespace Legion { // We can always write the result immediately since we know // that the common sub-expression code will give the same // result if there is a race - union_expr.store(context->union_index_spaces(child_spaces)); + IndexSpaceExpression *expr = context->union_index_spaces(child_spaces); + expr->add_nested_expression_reference(did); + union_expr.store(expr); } else // if we're complete the parent is our expression + { + parent->add_nested_expression_reference(did); union_expr.store(parent); + } return union_expr.load(); } diff --git a/runtime/legion/region_tree.h b/runtime/legion/region_tree.h index a12b3cde8e..ddd6e93b3b 100644 --- a/runtime/legion/region_tree.h +++ b/runtime/legion/region_tree.h @@ -68,13 +68,15 @@ namespace Legion { */ class OperationCreator { public: - OperationCreator(void); + OperationCreator(RegionTreeForest *f); virtual ~OperationCreator(void); public: void produce(IndexSpaceOperation *op); IndexSpaceExpression* consume(void); public: virtual void create_operation(void) = 0; + public: + RegionTreeForest *const forest; protected: IndexSpaceOperation *result; }; @@ -813,10 +815,19 @@ namespace Legion { // if these operations 
have been requested before and if so will // return the common sub-expression, if not we will actually do // the computation and memoize it for the future + // + // Note that you do not need to worry about reference counting + // expressions returned from these methods inside of tasks because + // we implicitly add references to them and store them in the + // implicit_live_expression data structure and then remove the + // references after the meta-task or runtime call is done executing. + IndexSpaceExpression* union_index_spaces(IndexSpaceExpression *lhs, - IndexSpaceExpression *rhs); + IndexSpaceExpression *rhs, + ReferenceMutator *mutator = NULL); IndexSpaceExpression* union_index_spaces( - const std::set &exprs); + const std::set &exprs, + ReferenceMutator *mutator = NULL); protected: // Internal version IndexSpaceExpression* union_index_spaces( @@ -824,27 +835,36 @@ namespace Legion { OperationCreator *creator = NULL); public: IndexSpaceExpression* intersect_index_spaces( - IndexSpaceExpression *lhs, - IndexSpaceExpression *rhs); + IndexSpaceExpression *lhs, + IndexSpaceExpression *rhs, + ReferenceMutator *mutator = NULL); IndexSpaceExpression* intersect_index_spaces( - const std::set &exprs); + const std::set &exprs, + ReferenceMutator *mutator = NULL); protected: IndexSpaceExpression* intersect_index_spaces( const std::vector &exprs, OperationCreator *creator = NULL); public: IndexSpaceExpression* subtract_index_spaces(IndexSpaceExpression *lhs, - IndexSpaceExpression *rhs, OperationCreator *creator = NULL); + IndexSpaceExpression *rhs, OperationCreator *creator = NULL, + ReferenceMutator *mutator = NULL); public: IndexSpaceExpression* find_canonical_expression(IndexSpaceExpression *ex); void remove_canonical_expression(IndexSpaceExpression *expr, size_t vol); private: static inline bool compare_expressions(IndexSpaceExpression *one, IndexSpaceExpression *two); + struct CompareExpressions { + public: + inline bool operator()(IndexSpaceExpression *one, + 
IndexSpaceExpression *two) const + { return compare_expressions(one, two); } + }; public: // Methods for removing index space expression when they are done - void invalidate_index_space_expression( - const std::vector &parents); + void invalidate_index_space_operations( + const std::vector &derived); void remove_union_operation(IndexSpaceOperation *expr, const std::vector &exprs); void remove_intersection_operation(IndexSpaceOperation *expr, @@ -1025,12 +1045,14 @@ namespace Legion { static const LgTaskID TASK_ID = LG_TIGHTEN_INDEX_SPACE_TASK_ID; public: - TightenIndexSpaceArgs(IndexSpaceExpression *proxy) + TightenIndexSpaceArgs(IndexSpaceExpression *proxy, + DistributedCollectable *dc) : LgTaskArgs(implicit_provenance), - proxy_this(proxy) - { proxy->add_base_expression_reference(IS_EXPR_REF); } + proxy_this(proxy), proxy_dc(dc) + { proxy_dc->add_base_resource_ref(META_TASK_REF); } public: IndexSpaceExpression *const proxy_this; + DistributedCollectable *const proxy_dc; }; public: template @@ -1092,9 +1114,14 @@ namespace Legion { virtual void pack_expression_value(Serializer &rez, AddressSpaceID target) = 0; public: +#ifdef DEBUG_LEGION + virtual bool is_valid(void) const = 0; +#endif virtual DistributedID get_distributed_id(void) const = 0; virtual bool try_add_canonical_reference(DistributedID source) = 0; virtual bool remove_canonical_reference(DistributedID source) = 0; + virtual bool try_add_live_reference(ReferenceSource source) = 0; + virtual bool remove_live_reference(ReferenceSource source) = 0; virtual void add_base_expression_reference(ReferenceSource source, ReferenceMutator *mutator = NULL, unsigned count = 1) = 0; virtual void add_nested_expression_reference(DistributedID source, @@ -1110,7 +1137,6 @@ namespace Legion { virtual bool remove_tree_expression_reference(DistributedID source, unsigned count = 1) = 0; public: - virtual bool remove_operation(RegionTreeForest *forest) = 0; virtual IndexSpaceNode* create_node(IndexSpace handle, DistributedID 
did, RtEvent initialized, std::set *applied) = 0; @@ -1177,6 +1203,7 @@ namespace Legion { bool compact,LayoutConstraintKind *unsat_kind = NULL, unsigned *unsat_index = NULL,void **piece_list =NULL, size_t *piece_list_size = NULL) = 0; + // Return the expression with a resource ref on the expression virtual IndexSpaceExpression* create_layout_expression( const void *piece_list, size_t piece_list_size) = 0; virtual bool meets_layout_expression(IndexSpaceExpression *expr, @@ -1188,8 +1215,10 @@ namespace Legion { static void handle_tighten_index_space(const void *args); static AddressSpaceID get_owner_space(IndexSpaceExprID id, Runtime *rt); public: - void add_parent_operation(IndexSpaceOperation *op); - void remove_parent_operation(IndexSpaceOperation *op); + void add_derived_operation(IndexSpaceOperation *op); + void remove_derived_operation(IndexSpaceOperation *op); + void invalidate_derived_operations(DistributedID did, + RegionTreeForest *context); public: inline bool is_empty(void) { @@ -1304,22 +1333,85 @@ namespace Legion { RegionTreeForest *forest, AddressSpaceID source, bool &is_local,bool &is_index_space,IndexSpace &handle, IndexSpaceExprID &remote_expr_id, RtEvent &wait_for); - static void finalize_canonical(size_t volume, RegionTreeForest *forest, - IndexSpaceExpression *original, - IndexSpaceExpression *canonical); public: const TypeTag type_tag; const IndexSpaceExprID expr_id; private: LocalLock &expr_lock; protected: - std::set parent_operations; + std::set derived_operations; IndexSpaceExpression *canonical; size_t volume; bool has_volume; bool empty, has_empty; }; + /** + * This is a move-only object that tracks temporary references to + * index space expressions that are returned from region tree ops + */ + class IndexSpaceExprRef { + public: + IndexSpaceExprRef(void) : expr(NULL) { } + IndexSpaceExprRef(IndexSpaceExpression *e, ReferenceMutator *m = NULL) + : expr(e) + { + if (expr != NULL) + { + if (m == NULL) + { + LocalReferenceMutator 
local_mutator; + expr->add_base_expression_reference(LIVE_EXPR_REF, &local_mutator); + } + else + expr->add_base_expression_reference(LIVE_EXPR_REF, m); + } + } + IndexSpaceExprRef(const IndexSpaceExprRef &rhs) = delete; + IndexSpaceExprRef(IndexSpaceExprRef &&rhs) + : expr(rhs.expr) + { + rhs.expr = NULL; + } + ~IndexSpaceExprRef(void) + { + if ((expr != NULL) && + expr->remove_base_expression_reference(LIVE_EXPR_REF)) + delete expr; + } + IndexSpaceExprRef& operator=(const IndexSpaceExprRef &rhs) = delete; + inline IndexSpaceExprRef& operator=(IndexSpaceExprRef &&rhs) + { + if ((expr != NULL) && + expr->remove_base_expression_reference(LIVE_EXPR_REF)) + delete expr; + expr = rhs.expr; + rhs.expr = NULL; + return *this; + } + public: + inline bool operator==(const IndexSpaceExprRef &rhs) const + { + if (expr == NULL) + return (rhs.expr == NULL); + if (rhs.expr == NULL) + return false; + return (expr->expr_id == rhs.expr->expr_id); + } + inline bool operator<(const IndexSpaceExprRef &rhs) const + { + if (expr == NULL) + return (rhs.expr != NULL); + if (rhs.expr == NULL) + return false; + return (expr->expr_id < rhs.expr->expr_id); + } + inline IndexSpaceExpression* operator->(void) { return expr; } + inline IndexSpaceExpression* operator&(void) { return expr; } + protected: + IndexSpaceExpression *expr; + }; + class IndexSpaceOperation : public IndexSpaceExpression, public DistributedCollectable { public: @@ -1330,6 +1422,17 @@ namespace Legion { REMOTE_EXPRESSION_KIND, INSTANCE_EXPRESSION_KIND, }; + public: + class InactiveFunctor { + public: + InactiveFunctor(IndexSpaceOperation *o, ReferenceMutator *m) + : op(o), mutator(m) { } + public: + void apply(AddressSpaceID target); + public: + IndexSpaceOperation *const op; + ReferenceMutator *const mutator; + }; public: IndexSpaceOperation(TypeTag tag, OperationKind kind, RegionTreeForest *ctx); @@ -1340,9 +1443,8 @@ namespace Legion { public: virtual void notify_active(ReferenceMutator *mutator); virtual void 
notify_inactive(ReferenceMutator *mutator); - // We should never be using valid references for index space expressions - virtual void notify_valid(ReferenceMutator *mutator) { assert(false); } - virtual void notify_invalid(ReferenceMutator *mutator) { assert(false); } + virtual void notify_valid(ReferenceMutator *mutator); + virtual void notify_invalid(ReferenceMutator *mutator); public: virtual ApEvent get_expr_index_space(void *result, TypeTag tag, bool need_tight_result) = 0; @@ -1354,9 +1456,14 @@ namespace Legion { virtual void pack_expression_value(Serializer &rez, AddressSpaceID target) = 0; public: +#ifdef DEBUG_LEGION + virtual bool is_valid(void) const { return check_valid(); } +#endif virtual DistributedID get_distributed_id(void) const { return did; } virtual bool try_add_canonical_reference(DistributedID source); virtual bool remove_canonical_reference(DistributedID source); + virtual bool try_add_live_reference(ReferenceSource source); + virtual bool remove_live_reference(ReferenceSource source); virtual void add_base_expression_reference(ReferenceSource source, ReferenceMutator *mutator = NULL, unsigned count = 1); virtual void add_nested_expression_reference(DistributedID source, @@ -1372,20 +1479,22 @@ namespace Legion { virtual bool remove_tree_expression_reference(DistributedID source, unsigned count = 1); public: - virtual bool remove_operation(RegionTreeForest *forest) = 0; + virtual bool invalidate_operation(void) = 0; + virtual void remove_operation(void) = 0; virtual IndexSpaceNode* create_node(IndexSpace handle, DistributedID did, RtEvent initialized, std::set *applied) = 0; - public: - void invalidate_operation(std::deque &to_remove); public: RegionTreeForest *const context; IndexSpaceOperation *const origin_expr; const OperationKind op_kind; protected: mutable LocalLock inter_lock; + std::atomic invalidated; +#ifdef DEBUG_LEGION private: - int invalidated; + bool tree_active; +#endif }; template @@ -1406,7 +1515,8 @@ namespace Legion { 
virtual void pack_expression(Serializer &rez, AddressSpaceID target); virtual void pack_expression_value(Serializer &rez, AddressSpaceID target) = 0; - virtual bool remove_operation(RegionTreeForest *forest) = 0; + virtual bool invalidate_operation(void) = 0; + virtual void remove_operation(void) = 0; virtual IndexSpaceNode* create_node(IndexSpace handle, DistributedID did, RtEvent initialized, std::set *applied); @@ -1504,7 +1614,8 @@ namespace Legion { IndexSpaceUnion& operator=(const IndexSpaceUnion &rhs); public: virtual void pack_expression_value(Serializer &rez,AddressSpaceID target); - virtual bool remove_operation(RegionTreeForest *forest); + virtual bool invalidate_operation(void); + virtual void remove_operation(void); protected: const std::vector sub_expressions; }; @@ -1513,7 +1624,7 @@ namespace Legion { public: UnionOpCreator(RegionTreeForest *f, TypeTag t, const std::vector &e) - : forest(f), type_tag(t), exprs(e) { } + : OperationCreator(f), type_tag(t), exprs(e) { } public: template static inline void demux(UnionOpCreator *creator) @@ -1525,7 +1636,6 @@ namespace Legion { virtual void create_operation(void) { NT_TemplateHelper::demux(type_tag, this); } public: - RegionTreeForest *const forest; const TypeTag type_tag; const std::vector &exprs; }; @@ -1544,7 +1654,8 @@ namespace Legion { IndexSpaceIntersection& operator=(const IndexSpaceIntersection &rhs); public: virtual void pack_expression_value(Serializer &rez,AddressSpaceID target); - virtual bool remove_operation(RegionTreeForest *forest); + virtual bool invalidate_operation(void); + virtual void remove_operation(void); protected: const std::vector sub_expressions; }; @@ -1553,7 +1664,7 @@ namespace Legion { public: IntersectionOpCreator(RegionTreeForest *f, TypeTag t, const std::vector &e) - : forest(f), type_tag(t), exprs(e) { } + : OperationCreator(f), type_tag(t), exprs(e) { } public: template static inline void demux(IntersectionOpCreator *creator) @@ -1565,7 +1676,6 @@ namespace Legion { 
virtual void create_operation(void) { NT_TemplateHelper::demux(type_tag, this); } public: - RegionTreeForest *const forest; const TypeTag type_tag; const std::vector &exprs; }; @@ -1584,7 +1694,8 @@ namespace Legion { IndexSpaceDifference& operator=(const IndexSpaceDifference &rhs); public: virtual void pack_expression_value(Serializer &rez,AddressSpaceID target); - virtual bool remove_operation(RegionTreeForest *forest); + virtual bool invalidate_operation(void); + virtual void remove_operation(void); protected: IndexSpaceExpression *const lhs; IndexSpaceExpression *const rhs; @@ -1594,7 +1705,7 @@ namespace Legion { public: DifferenceOpCreator(RegionTreeForest *f, TypeTag t, IndexSpaceExpression *l, IndexSpaceExpression *r) - : forest(f), type_tag(t), lhs(l), rhs(r) { } + : OperationCreator(f), type_tag(t), lhs(l), rhs(r) { } public: template static inline void demux(DifferenceOpCreator *creator) @@ -1606,7 +1717,6 @@ namespace Legion { virtual void create_operation(void) { NT_TemplateHelper::demux(type_tag, this); } public: - RegionTreeForest *const forest; const TypeTag type_tag; IndexSpaceExpression *const lhs; IndexSpaceExpression *const rhs; @@ -1631,7 +1741,8 @@ namespace Legion { InstanceExpression& operator=(const InstanceExpression &rhs); public: virtual void pack_expression_value(Serializer &rez,AddressSpaceID target); - virtual bool remove_operation(RegionTreeForest *forest); + virtual bool invalidate_operation(void); + virtual void remove_operation(void); }; /** @@ -1653,14 +1764,14 @@ namespace Legion { RemoteExpression& operator=(const RemoteExpression &op); public: virtual void pack_expression_value(Serializer &rez,AddressSpaceID target); - virtual bool remove_operation(RegionTreeForest *forest); + virtual bool invalidate_operation(void); + virtual void remove_operation(void); }; - class RemoteExpressionCreator : public OperationCreator { + class RemoteExpressionCreator { public: - RemoteExpressionCreator(RegionTreeForest *f, Deserializer &d) - : 
forest(f), type_tag(unpack_type_tag(d)), derez(d) - { NT_TemplateHelper::demux(type_tag, this); } + RemoteExpressionCreator(RegionTreeForest *f, TypeTag t, Deserializer &d) + : forest(f), type_tag(t), derez(d), operation(NULL) { } public: template static inline void demux(RemoteExpressionCreator *creator) @@ -1673,24 +1784,18 @@ namespace Legion { creator->derez.deserialize(owner_space); IndexSpaceOperation *origin; creator->derez.deserialize(origin); - creator->produce( +#ifdef DEBUG_LEGION + assert(creator->operation == NULL); +#endif + creator->operation = new RemoteExpression(creator->forest, expr_id, did, - owner_space, origin, creator->type_tag, creator->derez)); - } - public: - // Nothing to do for this - virtual void create_operation(void) { } - public: - static inline TypeTag unpack_type_tag(Deserializer &derez) - { - TypeTag tag; - derez.deserialize(tag); - return tag; + owner_space, origin, creator->type_tag, creator->derez); } public: RegionTreeForest *const forest; const TypeTag type_tag; Deserializer &derez; + IndexSpaceOperation *operation; }; /** @@ -1867,10 +1972,10 @@ namespace Legion { const AddressSpaceID source; Serializer &rez; }; - class InvalidFunctor { + class InactiveFunctor { public: - InvalidFunctor(IndexSpaceNode *n, ReferenceMutator *m, - std::map &effects) + InactiveFunctor(IndexSpaceNode *n, ReferenceMutator *m, + std::map &effects) : node(n), mutator(m), send_effects(effects) { } public: void apply(AddressSpaceID target); @@ -1890,6 +1995,7 @@ namespace Legion { public: IndexSpaceNode& operator=(const IndexSpaceNode &rhs); public: + virtual void notify_active(ReferenceMutator *mutator); virtual void notify_valid(ReferenceMutator *mutator); virtual void notify_invalid(ReferenceMutator *mutator); virtual void notify_inactive(ReferenceMutator *mutator); @@ -1977,9 +2083,14 @@ namespace Legion { virtual void pack_expression(Serializer &rez, AddressSpaceID target); virtual void pack_expression_value(Serializer &rez,AddressSpaceID 
target); public: +#ifdef DEBUG_LEGION + virtual bool is_valid(void) const { return check_valid(); } +#endif virtual DistributedID get_distributed_id(void) const { return did; } virtual bool try_add_canonical_reference(DistributedID source); virtual bool remove_canonical_reference(DistributedID source); + virtual bool try_add_live_reference(ReferenceSource source); + virtual bool remove_live_reference(ReferenceSource source); virtual void add_base_expression_reference(ReferenceSource source, ReferenceMutator *mutator = NULL, unsigned count = 1); virtual void add_nested_expression_reference(DistributedID source, @@ -1995,7 +2106,6 @@ namespace Legion { virtual bool remove_tree_expression_reference(DistributedID source, unsigned count = 1); public: - virtual bool remove_operation(RegionTreeForest *forest); virtual IndexSpaceNode* create_node(IndexSpace handle, DistributedID did, RtEvent initialized, std::set *applied) = 0; @@ -2140,6 +2250,10 @@ namespace Legion { bool tight_index_space; // Keep track of whether we're still valid on the owner bool tree_valid; +#ifdef DEBUG_LEGION + // Keep track of whether we are active, should only happen once + bool tree_active; +#endif }; /** diff --git a/runtime/legion/region_tree.inl b/runtime/legion/region_tree.inl index 93a0afe5fd..7c5ed0420c 100644 --- a/runtime/legion/region_tree.inl +++ b/runtime/legion/region_tree.inl @@ -1126,11 +1126,11 @@ namespace Legion { { if (rects == NULL) { - if (!space.dense()) + if (space.dense()) + return this; + else // Make a new expression for the bounding box return new InstanceExpression(&space.bounds,1/*size*/,context); - else // if we're dense we can just use ourselves - return this; } else { @@ -1867,6 +1867,8 @@ namespace Legion { sub_expressions(to_union) //-------------------------------------------------------------------------- { + // Add an resource ref that will be removed by the OperationCreator + this->add_base_resource_ref(REGION_TREE_REF); std::set preconditions; std::vector > 
spaces(sub_expressions.size()); for (unsigned idx = 0; idx < sub_expressions.size(); idx++) @@ -1876,7 +1878,7 @@ namespace Legion { assert(sub->get_canonical_expression(this->context) == sub); #endif // Add the parent and the reference - sub->add_parent_operation(this); + sub->add_derived_operation(this); sub->add_tree_expression_reference(this->did); // Then get the realm index space expression ApEvent precondition = sub->get_expr_index_space( @@ -1900,7 +1902,7 @@ namespace Legion { if (!this->realm_index_space_ready.has_triggered() || !valid_event.has_triggered()) { - IndexSpaceExpression::TightenIndexSpaceArgs args(this); + IndexSpaceExpression::TightenIndexSpaceArgs args(this, this); if (!this->realm_index_space_ready.has_triggered()) { if (!valid_event.has_triggered()) @@ -1987,18 +1989,25 @@ namespace Legion { //-------------------------------------------------------------------------- template - bool IndexSpaceUnion::remove_operation(RegionTreeForest *forest) + bool IndexSpaceUnion::invalidate_operation(void) //-------------------------------------------------------------------------- { + // Make sure we only do this one time + if (this->invalidated.fetch_add(1) > 0) + return false; // Remove the parent operation from all the sub expressions for (unsigned idx = 0; idx < sub_expressions.size(); idx++) - sub_expressions[idx]->remove_parent_operation(this); - // Then remove ourselves from the tree - if (forest != NULL) - forest->remove_union_operation(this, sub_expressions); - // Remove our expression reference added by invalidate_operation - // and return true if we should be deleted - return this->remove_base_resource_ref(IS_EXPR_REF); + sub_expressions[idx]->remove_derived_operation(this); + // We were successfully removed + return true; + } + + //-------------------------------------------------------------------------- + template + void IndexSpaceUnion::remove_operation(void) + //-------------------------------------------------------------------------- 
+ { + this->context->remove_union_operation(this, sub_expressions); } //-------------------------------------------------------------------------- @@ -2010,6 +2019,8 @@ namespace Legion { sub_expressions(to_inter) //-------------------------------------------------------------------------- { + // Add an resource ref that will be removed by the OperationCreator + this->add_base_resource_ref(REGION_TREE_REF); std::set preconditions; std::vector > spaces(sub_expressions.size()); for (unsigned idx = 0; idx < sub_expressions.size(); idx++) @@ -2019,7 +2030,7 @@ namespace Legion { assert(sub->get_canonical_expression(this->context) == sub); #endif // Add the parent and the reference - sub->add_parent_operation(this); + sub->add_derived_operation(this); sub->add_tree_expression_reference(this->did); ApEvent precondition = sub->get_expr_index_space( &spaces[idx], this->type_tag, false/*need tight result*/); @@ -2042,7 +2053,7 @@ namespace Legion { if (!this->realm_index_space_ready.has_triggered() || !valid_event.has_triggered()) { - IndexSpaceExpression::TightenIndexSpaceArgs args(this); + IndexSpaceExpression::TightenIndexSpaceArgs args(this, this); if (!this->realm_index_space_ready.has_triggered()) { if (!valid_event.has_triggered()) @@ -2130,19 +2141,25 @@ namespace Legion { //-------------------------------------------------------------------------- template - bool IndexSpaceIntersection::remove_operation( - RegionTreeForest *forest) + bool IndexSpaceIntersection::invalidate_operation(void) //-------------------------------------------------------------------------- { + // Make sure we only do this one time + if (this->invalidated.fetch_add(1) > 0) + return false; // Remove the parent operation from all the sub expressions for (unsigned idx = 0; idx < sub_expressions.size(); idx++) - sub_expressions[idx]->remove_parent_operation(this); - // Then remove ourselves from the tree - if (forest != NULL) - forest->remove_intersection_operation(this, sub_expressions); - // 
Remove our expression reference added by invalidate_operation - // and return true if we should be deleted - return this->remove_base_resource_ref(IS_EXPR_REF); + sub_expressions[idx]->remove_derived_operation(this); + // We were successfully removed + return true; + } + + //-------------------------------------------------------------------------- + template + void IndexSpaceIntersection::remove_operation(void) + //-------------------------------------------------------------------------- + { + this->context->remove_intersection_operation(this, sub_expressions); } //-------------------------------------------------------------------------- @@ -2153,6 +2170,8 @@ namespace Legion { , lhs(l), rhs(r) //-------------------------------------------------------------------------- { + // Add an resource ref that will be removed by the OperationCreator + this->add_base_resource_ref(REGION_TREE_REF); #ifdef DEBUG_LEGION assert(lhs->get_canonical_expression(this->context) == lhs); assert(rhs->get_canonical_expression(this->context) == rhs); @@ -2160,7 +2179,7 @@ namespace Legion { if (lhs == rhs) { // Special case for when the expressions are the same - lhs->add_parent_operation(this); + lhs->add_derived_operation(this); lhs->add_tree_expression_reference(this->did); this->realm_index_space = Realm::IndexSpace::make_empty(); this->tight_index_space = Realm::IndexSpace::make_empty(); @@ -2171,8 +2190,8 @@ namespace Legion { { Realm::IndexSpace lhs_space, rhs_space; // Add the parent and the references - lhs->add_parent_operation(this); - rhs->add_parent_operation(this); + lhs->add_derived_operation(this); + rhs->add_derived_operation(this); lhs->add_tree_expression_reference(this->did); rhs->add_tree_expression_reference(this->did); ApEvent left_ready = @@ -2195,7 +2214,7 @@ namespace Legion { if (!this->realm_index_space_ready.has_triggered() || !valid_event.has_triggered()) { - IndexSpaceExpression::TightenIndexSpaceArgs args(this); + 
IndexSpaceExpression::TightenIndexSpaceArgs args(this, this); if (!this->realm_index_space_ready.has_triggered()) { if (!valid_event.has_triggered()) @@ -2282,20 +2301,28 @@ namespace Legion { //-------------------------------------------------------------------------- template - bool IndexSpaceDifference::remove_operation(RegionTreeForest *forest) + bool IndexSpaceDifference::invalidate_operation(void) //-------------------------------------------------------------------------- { + // Make sure we only do this one time + if (this->invalidated.fetch_add(1) > 0) + return false; // Remove the parent operation from all the sub expressions if (lhs != NULL) - lhs->remove_parent_operation(this); + lhs->remove_derived_operation(this); if ((rhs != NULL) && (lhs != rhs)) - rhs->remove_parent_operation(this); - // Then remove ourselves from the tree - if ((forest != NULL) && (lhs != NULL) && (rhs != NULL)) - forest->remove_subtraction_operation(this, lhs, rhs); - // Remove our expression reference added by invalidate_operation - // and return true if we should be deleted - return this->remove_base_resource_ref(IS_EXPR_REF); + rhs->remove_derived_operation(this); + // We were successfully removed + return true; + } + + //-------------------------------------------------------------------------- + template + void IndexSpaceDifference::remove_operation(void) + //-------------------------------------------------------------------------- + { + if ((lhs != NULL) && (rhs != NULL)) + this->context->remove_subtraction_operation(this, lhs, rhs); } ///////////////////////////////////////////////////////////// @@ -2310,6 +2337,11 @@ namespace Legion { IndexSpaceOperation::INSTANCE_EXPRESSION_KIND, forest) //-------------------------------------------------------------------------- { + // This is another kind of live expression made by the region tree + this->add_base_expression_reference(LIVE_EXPR_REF); + if (implicit_live_expressions == NULL) + implicit_live_expressions = new 
std::vector; + implicit_live_expressions->emplace_back(this); #ifdef DEBUG_LEGION assert(num_rects > 0); #endif @@ -2322,7 +2354,7 @@ namespace Legion { const RtEvent valid_event(this->realm_index_space.make_valid()); if (!valid_event.has_triggered()) { - IndexSpaceExpression::TightenIndexSpaceArgs args(this); + IndexSpaceExpression::TightenIndexSpaceArgs args(this, this); this->tight_index_space_ready = forest->runtime->issue_runtime_meta_task(args, LG_LATENCY_WORK_PRIORITY, valid_event); @@ -2418,7 +2450,7 @@ namespace Legion { //-------------------------------------------------------------------------- template - bool InstanceExpression::remove_operation(RegionTreeForest *forest) + bool InstanceExpression::invalidate_operation(void) //-------------------------------------------------------------------------- { // should never be called @@ -2426,6 +2458,14 @@ namespace Legion { return false; } + //-------------------------------------------------------------------------- + template + void InstanceExpression::remove_operation(void) + //-------------------------------------------------------------------------- + { + // Nothing to do here since we're not in the region tree + } + ///////////////////////////////////////////////////////////// // Remote Expression ///////////////////////////////////////////////////////////// @@ -2481,7 +2521,7 @@ namespace Legion { //-------------------------------------------------------------------------- template - bool RemoteExpression::remove_operation(RegionTreeForest *forest) + bool RemoteExpression::invalidate_operation(void) //-------------------------------------------------------------------------- { // should never be called @@ -2489,6 +2529,15 @@ namespace Legion { return false; } + //-------------------------------------------------------------------------- + template + void RemoteExpression::remove_operation(void) + //-------------------------------------------------------------------------- + { + // should never be 
called + assert(false); + } + ///////////////////////////////////////////////////////////// // Templated Index Space Node ///////////////////////////////////////////////////////////// @@ -2688,7 +2737,7 @@ namespace Legion { if (!index_space_ready.has_triggered() || !valid_event.has_triggered()) { // If this index space isn't ready yet, then we have to defer this - TightenIndexSpaceArgs args(this); + TightenIndexSpaceArgs args(this, this); if (!index_space_ready.has_triggered()) { if (!valid_event.has_triggered()) diff --git a/runtime/legion/runtime.cc b/runtime/legion/runtime.cc index b34322e005..978b1c2706 100644 --- a/runtime/legion/runtime.cc +++ b/runtime/legion/runtime.cc @@ -75,6 +75,7 @@ namespace Legion { __thread AutoLock *local_lock_list = NULL; __thread UniqueID implicit_provenance = 0; __thread unsigned inside_registration_callback = NO_REGISTRATION_CALLBACK; + __thread std::vector *implicit_live_expressions=NULL; const LgEvent LgEvent::NO_LG_EVENT = LgEvent(); const ApEvent ApEvent::NO_AP_EVENT = ApEvent(); @@ -13143,11 +13144,25 @@ namespace Legion { bool Runtime::is_index_partition_disjoint(Context ctx, IndexPartition p) //-------------------------------------------------------------------------- { +#ifdef DEBUG_LEGION + assert(implicit_live_expressions == NULL); +#endif if (ctx != DUMMY_CONTEXT) ctx->begin_runtime_call(); - bool result = forest->is_index_partition_disjoint(p); + const bool result = forest->is_index_partition_disjoint(p); if (ctx != DUMMY_CONTEXT) ctx->end_runtime_call(); + else if (implicit_live_expressions != NULL) + { + // Remove references to any live index space expressions we have + for (std::vector::const_iterator it = + implicit_live_expressions->begin(); it != + implicit_live_expressions->end(); it++) + if ((*it)->remove_base_expression_reference(LIVE_EXPR_REF)) + delete (*it); + delete implicit_live_expressions; + implicit_live_expressions = NULL; + } return result; } @@ -13155,18 +13170,47 @@ namespace Legion { bool 
Runtime::is_index_partition_disjoint(IndexPartition p) //-------------------------------------------------------------------------- { - return forest->is_index_partition_disjoint(p); +#ifdef DEBUG_LEGION + assert(implicit_live_expressions == NULL); +#endif + const bool result = forest->is_index_partition_disjoint(p); + if (implicit_live_expressions != NULL) + { + // Remove references to any live index space expressions we have + for (std::vector::const_iterator it = + implicit_live_expressions->begin(); it != + implicit_live_expressions->end(); it++) + if ((*it)->remove_base_expression_reference(LIVE_EXPR_REF)) + delete (*it); + delete implicit_live_expressions; + implicit_live_expressions = NULL; + } + return result; } //-------------------------------------------------------------------------- bool Runtime::is_index_partition_complete(Context ctx, IndexPartition p) //-------------------------------------------------------------------------- { +#ifdef DEBUG_LEGION + assert(implicit_live_expressions == NULL); +#endif if (ctx != DUMMY_CONTEXT) ctx->begin_runtime_call(); bool result = forest->is_index_partition_complete(p); if (ctx != DUMMY_CONTEXT) ctx->end_runtime_call(); + else if (implicit_live_expressions != NULL) + { + // Remove references to any live index space expressions we have + for (std::vector::const_iterator it = + implicit_live_expressions->begin(); it != + implicit_live_expressions->end(); it++) + if ((*it)->remove_base_expression_reference(LIVE_EXPR_REF)) + delete (*it); + delete implicit_live_expressions; + implicit_live_expressions = NULL; + } return result; } @@ -13174,7 +13218,22 @@ namespace Legion { bool Runtime::is_index_partition_complete(IndexPartition p) //-------------------------------------------------------------------------- { - return forest->is_index_partition_complete(p); +#ifdef DEBUG_LEGION + assert(implicit_live_expressions == NULL); +#endif + const bool result = forest->is_index_partition_complete(p); + if 
(implicit_live_expressions != NULL) + { + // Remove references to any live index space expressions we have + for (std::vector::const_iterator it = + implicit_live_expressions->begin(); it != + implicit_live_expressions->end(); it++) + if ((*it)->remove_base_expression_reference(LIVE_EXPR_REF)) + delete (*it); + delete implicit_live_expressions; + implicit_live_expressions = NULL; + } + return result; } //-------------------------------------------------------------------------- @@ -24370,6 +24429,7 @@ namespace Legion { if (!runtime->local_utils.empty()) assert(implicit_context == NULL); // this better hold #endif + assert(implicit_live_expressions == NULL); #endif implicit_runtime = runtime; // We immediately bump the priority of all meta-tasks once they start @@ -25000,6 +25060,17 @@ namespace Legion { default: assert(false); // should never get here } + if (implicit_live_expressions != NULL) + { + // Remove references to any live index space expressions we have + for (std::vector::const_iterator it = + implicit_live_expressions->begin(); it != + implicit_live_expressions->end(); it++) + if ((*it)->remove_base_expression_reference(LIVE_EXPR_REF)) + delete (*it); + delete implicit_live_expressions; + implicit_live_expressions = NULL; + } #ifdef DEBUG_LEGION if (tid < LG_BEGIN_SHUTDOWN_TASK_IDS) runtime->decrement_total_outstanding_tasks(tid, true/*meta*/); @@ -25077,6 +25148,7 @@ namespace Legion { Runtime *runtime = *((Runtime**)userdata); #ifdef DEBUG_LEGION assert(userlen == sizeof(Runtime**)); + assert(implicit_live_expressions == NULL); #endif implicit_runtime = runtime; // We immediately bump the priority of all meta-tasks once they start @@ -25116,6 +25188,17 @@ namespace Legion { default: assert(false); // should never get here } + if (implicit_live_expressions != NULL) + { + // Remove references to any live index space expressions we have + for (std::vector::const_iterator it = + implicit_live_expressions->begin(); it != + 
implicit_live_expressions->end(); it++) + if ((*it)->remove_base_expression_reference(LIVE_EXPR_REF)) + delete (*it); + delete implicit_live_expressions; + implicit_live_expressions = NULL; + } #ifdef DEBUG_LEGION runtime->decrement_total_outstanding_tasks(tid, true/*meta*/); #else From f068b35c7c70ac7c0177719456abec454ddf0fe1 Mon Sep 17 00:00:00 2001 From: Mike Bauer Date: Fri, 10 Dec 2021 01:28:18 -0800 Subject: [PATCH 12/36] legion: add missing distributed id encoding --- runtime/legion/region_tree.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime/legion/region_tree.cc b/runtime/legion/region_tree.cc index 249beb85c8..4eb323fd36 100644 --- a/runtime/legion/region_tree.cc +++ b/runtime/legion/region_tree.cc @@ -6880,8 +6880,8 @@ namespace Legion { IndexSpaceOperation::IndexSpaceOperation(TypeTag tag, OperationKind kind, RegionTreeForest *ctx) : IndexSpaceExpression(tag, ctx->runtime, inter_lock), - DistributedCollectable(ctx->runtime, - ctx->runtime->get_available_distributed_id(), + DistributedCollectable(ctx->runtime, LEGION_DISTRIBUTED_HELP_ENCODE( + ctx->runtime->get_available_distributed_id(), INDEX_EXPR_NODE_DC), ctx->runtime->address_space), context(ctx), origin_expr(this), op_kind(kind), invalidated(0) #ifdef DEBUG_LEGION From d4a29b337e0ba4d1417f9914dd017b842b567999 Mon Sep 17 00:00:00 2001 From: Sean Treichler Date: Mon, 6 Dec 2021 12:17:35 -0800 Subject: [PATCH 13/36] realm: look up modules by name --- runtime/realm/runtime_impl.cc | 28 +++++++++++++++++++++++++++- runtime/realm/runtime_impl.h | 13 +++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/runtime/realm/runtime_impl.cc b/runtime/realm/runtime_impl.cc index c19a88c70d..437bcd404d 100644 --- a/runtime/realm/runtime_impl.cc +++ b/runtime/realm/runtime_impl.cc @@ -810,7 +810,8 @@ namespace Realm { sampling_profiler(true /*system default*/), num_local_memories(0), num_local_ib_memories(0), num_local_processors(0), - module_registrar(this) + 
module_registrar(this), + modules_created(false) { machine = new MachineImpl; } @@ -907,6 +908,30 @@ namespace Realm { return code_translators; } + Module *RuntimeImpl::get_module_untyped(const char *name) const + { + if(!modules_created) { + log_runtime.fatal() << "request for '" << name + << "' module before all modules have been created"; + abort(); + } + + // TODO: worth building a map here instead? + for(std::vector::const_iterator it = modules.begin(); + it != modules.end(); + ++it) + if(!strcmp(name, (*it)->get_name().c_str())) + return *it; + + for(std::vector::const_iterator it = network_modules.begin(); + it != network_modules.end(); + ++it) + if(!strcmp(name, (*it)->get_name().c_str())) + return *it; + + return 0; + } + static void add_proc_mem_affinities(MachineImpl *machine, const std::set& procs, const std::set& mems, @@ -1187,6 +1212,7 @@ namespace Realm { // now load modules module_registrar.create_static_modules(cmdline, modules); module_registrar.create_dynamic_modules(cmdline, modules); + modules_created = true; PartitioningOpQueue::configure_from_cmdline(cmdline); diff --git a/runtime/realm/runtime_impl.h b/runtime/realm/runtime_impl.h index 2c11ef60dc..f42b71010d 100644 --- a/runtime/realm/runtime_impl.h +++ b/runtime/realm/runtime_impl.h @@ -361,12 +361,25 @@ namespace Realm { const std::vector& get_code_translators(void) const; + template + T *get_module(const char *name) const + { + Module *mod = get_module_untyped(name); + if(mod) + return checked_cast(mod); + else + return 0; + } + protected: + Module *get_module_untyped(const char *name) const; + ID::IDType num_local_memories, num_local_ib_memories, num_local_processors; NetworkSegment reg_ib_mem_segment; NetworkSegment reg_mem_segment; ModuleRegistrar module_registrar; + bool modules_created; std::vector modules; std::vector code_translators; From 43f528e2a5b73e1d56ed6560531e8aec38905587 Mon Sep 17 00:00:00 2001 From: Sean Treichler Date: Wed, 8 Dec 2021 21:12:14 -0800 Subject: [PATCH 
14/36] ci: set LD_LIBRARY_PATH for cuda libs --- .gitlab-ci.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d661935f72..363c73c3fe 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -335,6 +335,11 @@ before_script: export LAUNCHER="mpirun -n 2 -x TERRA_PATH -x INCLUDE_PATH -x LD_LIBRARY_PATH -x LG_RT_DIR -x USE_RDIR" fi fi + - | + if [[ "$USE_CUDA" -eq 1 ]]; then + # make sure dynamic loading finds the right libcudart.so + export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA}/lib64" + fi - | if [[ "$USE_KOKKOS" -eq 1 ]]; then # Kokkos requires cmake 3.16 - grab it here so the default can remain @@ -374,9 +379,6 @@ before_script: # detection as well as for nvcc_wrapper (if used) PATH=$PATH:${CUDA}/bin - # libkokkoscore.so doesn't remember where libcudart.so is? - export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA}/lib64" - if [[ "$CXX" == *clang* ]]; then # clang speaks cuda, so can be used as Kokkos' "C++" compiler export KOKKOS_CXX_COMPILER="$CXX" From 68b4672be10f5054f397e272e6e52e14d962046c Mon Sep 17 00:00:00 2001 From: Sean Treichler Date: Mon, 6 Dec 2021 16:20:36 -0800 Subject: [PATCH 15/36] realm: support runtime loading of libcuda.so --- CMakeLists.txt | 8 + cmake/realm_defines.h.in | 1 + runtime/CMakeLists.txt | 8 +- runtime/realm/cuda/cuda_internal.cc | 188 +++++---- runtime/realm/cuda/cuda_internal.h | 107 ++++- runtime/realm/cuda/cuda_module.cc | 621 +++++++++++++++++++--------- 6 files changed, 645 insertions(+), 288 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 86dccb209c..e5134d8cb6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -302,6 +302,7 @@ endif() option(Legion_USE_CUDA "Enable support for the CUDA runtime" OFF) if(Legion_USE_CUDA) set(Legion_CUDA_ARCH "" CACHE STRING "Comma-separated list of CUDA architectures to build for (e.g. 
60,70)") + option(Legion_CUDA_DYNAMIC_LOAD "Load cuda libraries at runtime" OFF) if(NOT BUILD_SHARED_LIBS) set(CUDA_USE_STATIC_CUDA_RUNTIME ON) @@ -341,6 +342,13 @@ if(Legion_USE_CUDA) if(Legion_HIJACK_CUDART) set(REALM_USE_CUDART_HIJACK ON) endif() + if(Legion_CUDA_DYNAMIC_LOAD) + # not compatible with runtime hijack + if(Legion_HIJACK_CUDART) + message(FATAL_ERROR "Dynamic CUDA library loading (Legion_CUDA_DYNAMIC_LOAD) is not compatible with CUDA runtime hijack (Legion_HIJACK_CUDART)") + endif() + set(REALM_CUDA_DYNAMIC_LOAD ON) + endif() endif() #------------------------------------------------------------------------------# diff --git a/cmake/realm_defines.h.in b/cmake/realm_defines.h.in index 57a56b1038..90d1fa70f0 100644 --- a/cmake/realm_defines.h.in +++ b/cmake/realm_defines.h.in @@ -31,6 +31,7 @@ #cmakedefine REALM_USE_CUDA #cmakedefine REALM_USE_CUDART_HIJACK +#cmakedefine REALM_CUDA_DYNAMIC_LOAD #cmakedefine REALM_USE_HIP #cmakedefine REALM_USE_HIP_HIJACK diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index 3ed6118d93..9f88290362 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -439,11 +439,15 @@ if(Legion_USE_CUDA) string(REGEX REPLACE "[^\;]*cudart[^\;]*(\;?)" "" CUDA_LIBRARIES "${CUDA_LIBRARIES}") set(CUDA_LIBRARIES ${CUDA_LIBRARIES} PARENT_SCOPE) else() - target_link_libraries(RealmRuntime PRIVATE ${CUDA_CUDART_LIBRARY}) + if(NOT REALM_CUDA_DYNAMIC_LOAD) + target_link_libraries(RealmRuntime PRIVATE ${CUDA_CUDART_LIBRARY}) + endif() endif() target_include_directories(RealmRuntime PRIVATE ${CUDA_INCLUDE_DIRS}) - target_link_libraries(RealmRuntime PRIVATE ${CUDA_CUDA_LIBRARY}) + if(NOT REALM_CUDA_DYNAMIC_LOAD) + target_link_libraries(RealmRuntime PRIVATE ${CUDA_CUDA_LIBRARY}) + endif() # for backwards compatibility in applications target_compile_definitions(RealmRuntime INTERFACE USE_CUDA) endif() diff --git a/runtime/realm/cuda/cuda_internal.cc b/runtime/realm/cuda/cuda_internal.cc index 6e694275d7..484c127abe 100644 
--- a/runtime/realm/cuda/cuda_internal.cc +++ b/runtime/realm/cuda/cuda_internal.cc @@ -166,21 +166,24 @@ namespace Realm { // grr... prototypes of these differ slightly... if(in_gpu) { if(out_gpu) { - CHECK_CU( cuMemcpyDtoDAsync(static_cast(out_base + out_offset), - static_cast(in_base + in_offset), - bytes, - stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpyDtoDAsync) + (static_cast(out_base + out_offset), + static_cast(in_base + in_offset), + bytes, + stream->get_stream()) ); } else { - CHECK_CU( cuMemcpyDtoHAsync(reinterpret_cast(out_base + out_offset), - static_cast(in_base + in_offset), - bytes, - stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpyDtoHAsync) + (reinterpret_cast(out_base + out_offset), + static_cast(in_base + in_offset), + bytes, + stream->get_stream()) ); } } else { - CHECK_CU( cuMemcpyHtoDAsync(static_cast(out_base + out_offset), - reinterpret_cast(in_base + in_offset), - bytes, - stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpyHtoDAsync) + (static_cast(out_base + out_offset), + reinterpret_cast(in_base + in_offset), + bytes, + stream->get_stream()) ); } log_gpudma.info() << "gpu memcpy: dst=" << std::hex << (out_base + out_offset) @@ -268,7 +271,8 @@ namespace Realm { copy_info.WidthInBytes = contig_bytes; copy_info.Height = lines; - CHECK_CU( cuMemcpy2DAsync(©_info, stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpy2DAsync) + (©_info, stream->get_stream()) ); log_gpudma.info() << "gpu memcpy 2d: dst=" << std::hex << (out_base + out_offset) << std::dec @@ -348,7 +352,8 @@ namespace Realm { else copy_info.dstHost = reinterpret_cast(out_base + out_offset + (act_planes * out_pstride)); - CHECK_CU( cuMemcpy2DAsync(©_info, stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpy2DAsync) + (©_info, stream->get_stream()) ); act_planes++; if(work_until.is_expired()) @@ -701,20 +706,22 @@ namespace Realm { memcpy(&fill_u8, fill_data, 1); if(out_dim == 1) { size_t bytes = 
out_alc.remaining(0); - CHECK_CU( cuMemsetD8Async(CUdeviceptr(out_base + out_offset), - fill_u8, - bytes, - stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD8Async) + (CUdeviceptr(out_base + out_offset), + fill_u8, + bytes, + stream->get_stream()) ); out_alc.advance(0, bytes); total_bytes += bytes; } else { size_t bytes = out_alc.remaining(0); size_t lines = out_alc.remaining(1); - CHECK_CU( cuMemsetD2D8Async(CUdeviceptr(out_base + out_offset), - out_alc.get_stride(1), - fill_u8, - bytes, lines, - stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D8Async) + (CUdeviceptr(out_base + out_offset), + out_alc.get_stride(1), + fill_u8, + bytes, lines, + stream->get_stream()) ); out_alc.advance(1, lines); total_bytes += bytes * lines; } @@ -730,10 +737,11 @@ namespace Realm { #ifdef DEBUG_REALM assert((bytes & 1) == 0); #endif - CHECK_CU( cuMemsetD16Async(CUdeviceptr(out_base + out_offset), - fill_u16, - bytes >> 1, - stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD16Async) + (CUdeviceptr(out_base + out_offset), + fill_u16, + bytes >> 1, + stream->get_stream()) ); out_alc.advance(0, bytes); total_bytes += bytes; } else { @@ -743,11 +751,12 @@ namespace Realm { assert((bytes & 1) == 0); assert((out_alc.get_stride(1) & 1) == 0); #endif - CHECK_CU( cuMemsetD2D16Async(CUdeviceptr(out_base + out_offset), - out_alc.get_stride(1), - fill_u16, - bytes >> 1, lines, - stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D16Async) + (CUdeviceptr(out_base + out_offset), + out_alc.get_stride(1), + fill_u16, + bytes >> 1, lines, + stream->get_stream()) ); out_alc.advance(1, lines); total_bytes += bytes * lines; } @@ -763,10 +772,11 @@ namespace Realm { #ifdef DEBUG_REALM assert((bytes & 3) == 0); #endif - CHECK_CU( cuMemsetD32Async(CUdeviceptr(out_base + out_offset), - fill_u32, - bytes >> 2, - stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD32Async) + (CUdeviceptr(out_base + out_offset), + fill_u32, + bytes >> 2, 
+ stream->get_stream()) ); out_alc.advance(0, bytes); total_bytes += bytes; } else { @@ -776,11 +786,12 @@ namespace Realm { assert((bytes & 3) == 0); assert((out_alc.get_stride(1) & 3) == 0); #endif - CHECK_CU( cuMemsetD2D32Async(CUdeviceptr(out_base + out_offset), - out_alc.get_stride(1), - fill_u32, - bytes >> 2, lines, - stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D32Async) + (CUdeviceptr(out_base + out_offset), + out_alc.get_stride(1), + fill_u32, + bytes >> 2, lines, + stream->get_stream()) ); out_alc.advance(1, lines); total_bytes += bytes * lines; } @@ -811,11 +822,12 @@ namespace Realm { memcpy(&fill_u32, reinterpret_cast(fill_data) + partial_bytes, 4); - CHECK_CU( cuMemsetD2D32Async(CUdeviceptr(out_base + out_offset + partial_bytes), - reduced_fill_size, - fill_u32, - 1 /*"width"*/, fill_elems /*"height"*/, - stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D32Async) + (CUdeviceptr(out_base + out_offset + partial_bytes), + reduced_fill_size, + fill_u32, + 1 /*"width"*/, fill_elems /*"height"*/, + stream->get_stream()) ); partial_bytes += 4; } } @@ -826,11 +838,12 @@ namespace Realm { memcpy(&fill_u16, reinterpret_cast(fill_data) + partial_bytes, 2); - CHECK_CU( cuMemsetD2D16Async(CUdeviceptr(out_base + out_offset + partial_bytes), - reduced_fill_size, - fill_u16, - 1 /*"width"*/, fill_elems /*"height"*/, - stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D16Async) + (CUdeviceptr(out_base + out_offset + partial_bytes), + reduced_fill_size, + fill_u16, + 1 /*"width"*/, fill_elems /*"height"*/, + stream->get_stream()) ); partial_bytes += 2; } } @@ -840,21 +853,23 @@ namespace Realm { memcpy(&fill_u8, reinterpret_cast(fill_data) + partial_bytes, 1); - CHECK_CU( cuMemsetD2D8Async(CUdeviceptr(out_base + out_offset + partial_bytes), - reduced_fill_size, - fill_u8, - 1 /*"width"*/, fill_elems /*"height"*/, - stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D8Async) + (CUdeviceptr(out_base + 
out_offset + partial_bytes), + reduced_fill_size, + fill_u8, + 1 /*"width"*/, fill_elems /*"height"*/, + stream->get_stream()) ); partial_bytes += 1; } while(fill_elems < elems) { size_t todo = std::min(fill_elems, elems - fill_elems); - CHECK_CU( cuMemcpyAsync(CUdeviceptr(out_base + out_offset + - (fill_elems * reduced_fill_size)), - CUdeviceptr(out_base + out_offset), - todo * reduced_fill_size, - stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpyAsync) + (CUdeviceptr(out_base + out_offset + + (fill_elems * reduced_fill_size)), + CUdeviceptr(out_base + out_offset), + todo * reduced_fill_size, + stream->get_stream()) ); fill_elems += todo; } @@ -886,7 +901,8 @@ namespace Realm { copy2d.dstDevice = CUdeviceptr(out_base + out_offset + (lines_done * lstride)); copy2d.Height = todo; - CHECK_CU( cuMemcpy2DAsync(©2d, stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpy2DAsync) + (©2d, stream->get_stream()) ); lines_done += todo; } @@ -928,7 +944,8 @@ namespace Realm { copy3d.dstDevice = CUdeviceptr(out_base + out_offset + (planes_done * pstride)); copy3d.Depth = todo; - CHECK_CU( cuMemcpy3DAsync(©3d, stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpy3DAsync) + (©3d, stream->get_stream()) ); planes_done += todo; } @@ -942,7 +959,8 @@ namespace Realm { for(size_t p = 1; p < planes; p++) { copy2d.dstDevice = CUdeviceptr(out_base + out_offset + (p * pstride)); - CHECK_CU( cuMemcpy2DAsync(©2d, stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpy2DAsync) + (©2d, stream->get_stream()) ); } } } @@ -1066,10 +1084,10 @@ namespace Realm { #if CUDA_VERSION >= 11000 // we can ask the runtime to perform the mapping for us int orig_device; - CHECK_CUDART( cudaGetDevice(&orig_device) ); - CHECK_CUDART( cudaSetDevice(gpu->info->index) ); - CHECK_CUDART( cudaGetFuncBySymbol(&kernel, host_proxy) ); - CHECK_CUDART( cudaSetDevice(orig_device) ); + CHECK_CUDART( CUDA_RUNTIME_FNPTR(cudaGetDevice)(&orig_device) ); + CHECK_CUDART( 
CUDA_RUNTIME_FNPTR(cudaSetDevice)(gpu->info->index) ); + CHECK_CUDART( CUDA_RUNTIME_FNPTR(cudaGetFuncBySymbol)(&kernel, host_proxy) ); + CHECK_CUDART( CUDA_RUNTIME_FNPTR(cudaSetDevice)(orig_device) ); #else // no way to ask the runtime to perform the mapping, so we'll have // to actually launch the kernels with the runtime API @@ -1225,13 +1243,14 @@ namespace Realm { CU_LAUNCH_PARAM_END }; - CHECK_CU( cuLaunchKernel(kernel, - blocks_per_grid, 1, 1, - threads_per_block, 1, 1, - 0 /*sharedmem*/, - stream->get_stream(), - 0 /*params*/, - extra) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuLaunchKernel) + (kernel, + blocks_per_grid, 1, 1, + threads_per_block, 1, 1, + 0 /*sharedmem*/, + stream->get_stream(), + 0 /*params*/, + extra) ); #else int orig_device; void *params[] = { @@ -1242,15 +1261,16 @@ namespace Realm { &args->count, args+1 }; - CHECK_CUDART( cudaGetDevice(&orig_device) ); - CHECK_CUDART( cudaSetDevice(channel->gpu->info->index) ); - CHECK_CUDART( cudaLaunchKernel(kernel_host_proxy, - dim3(blocks_per_grid, 1, 1), - dim3(threads_per_block, 1, 1), - params, - 0 /*sharedMem*/, - (cudaStream_t)(stream->get_stream())) ); - CHECK_CUDART( cudaSetDevice(orig_device) ); + CHECK_CUDART( CUDA_RUNTIME_FNPTR(cudaGetDevice)(&orig_device) ); + CHECK_CUDART( CUDA_RUNTIME_FNPTR(cudaSetDevice)(channel->gpu->info->index) ); + CHECK_CUDART( CUDA_RUNTIME_FNPTR(cudaLaunchKernel) + (kernel_host_proxy, + dim3(blocks_per_grid, 1, 1), + dim3(threads_per_block, 1, 1), + params, + 0 /*sharedMem*/, + (cudaStream_t)(stream->get_stream())) ); + CHECK_CUDART( CUDA_RUNTIME_FNPTR(cudaSetDevice)(orig_device) ); #endif // insert fence to track completion of reduction kernel diff --git a/runtime/realm/cuda/cuda_internal.h b/runtime/realm/cuda/cuda_internal.h index c383baa629..e12b11ee36 100644 --- a/runtime/realm/cuda/cuda_internal.h +++ b/runtime/realm/cuda/cuda_internal.h @@ -16,13 +16,18 @@ #ifndef REALM_CUDA_INTERNAL_H #define REALM_CUDA_INTERNAL_H +#include "realm/realm_config.h" + #include // 
We don't actually use the cuda runtime, but // we need all its declarations so we have all the right types #include -#include "realm/realm_config.h" +#if defined(REALM_CUDA_DYNAMIC_LOAD) && (CUDA_VERSION >= 11030) +#include +#endif + #include "realm/operation.h" #include "realm/threads.h" #include "realm/circ_queue.h" @@ -35,7 +40,7 @@ #define CHECK_CUDART(cmd) do { \ cudaError_t ret = (cmd); \ if(ret != cudaSuccess) { \ - fprintf(stderr, "CUDART: %s = %d (%s)\n", #cmd, ret, cudaGetErrorString(ret)); \ + fprintf(stderr, "CUDART: %s = %d (%s)\n", #cmd, ret, CUDA_RUNTIME_FNPTR(cudaGetErrorString)(ret)); \ assert(0); \ exit(1); \ } \ @@ -46,8 +51,8 @@ #define REPORT_CU_ERROR(cmd, ret) \ do { \ const char *name, *str; \ - cuGetErrorName(ret, &name); \ - cuGetErrorString(ret, &str); \ + CUDA_DRIVER_FNPTR(cuGetErrorName)(ret, &name); \ + CUDA_DRIVER_FNPTR(cuGetErrorString)(ret, &str); \ fprintf(stderr, "CU: %s = %d (%s): %s\n", cmd, ret, name, str); \ abort(); \ } while(0) @@ -986,6 +991,100 @@ namespace Realm { unsigned *lat_ret = 0); }; +#ifdef REALM_CUDA_DYNAMIC_LOAD + // cuda driver and/or runtime entry points + #define CUDA_DRIVER_FNPTR(name) (name ## _fnptr) + #define CUDA_RUNTIME_FNPTR(name) (name ## _fnptr) + + #define CUDA_DRIVER_APIS(__op__) \ + __op__(cuCtxEnablePeerAccess); \ + __op__(cuCtxGetFlags); \ + __op__(cuCtxPopCurrent); \ + __op__(cuCtxPushCurrent); \ + __op__(cuCtxSynchronize); \ + __op__(cuDeviceCanAccessPeer); \ + __op__(cuDeviceGet); \ + __op__(cuDeviceGetAttribute); \ + __op__(cuDeviceGetCount); \ + __op__(cuDeviceGetName); \ + __op__(cuDevicePrimaryCtxRelease); \ + __op__(cuDevicePrimaryCtxRetain); \ + __op__(cuDevicePrimaryCtxSetFlags); \ + __op__(cuDeviceTotalMem); \ + __op__(cuEventCreate); \ + __op__(cuEventDestroy); \ + __op__(cuEventQuery); \ + __op__(cuEventRecord); \ + __op__(cuGetErrorName); \ + __op__(cuGetErrorString); \ + __op__(cuInit); \ + __op__(cuLaunchKernel); \ + __op__(cuMemAllocManaged); \ + __op__(cuMemAlloc); \ + 
__op__(cuMemcpy2DAsync); \ + __op__(cuMemcpy3DAsync); \ + __op__(cuMemcpyAsync); \ + __op__(cuMemcpyDtoDAsync); \ + __op__(cuMemcpyDtoHAsync); \ + __op__(cuMemcpyHtoDAsync); \ + __op__(cuMemFreeHost); \ + __op__(cuMemFree); \ + __op__(cuMemGetInfo); \ + __op__(cuMemHostAlloc); \ + __op__(cuMemHostGetDevicePointer); \ + __op__(cuMemHostRegister); \ + __op__(cuMemHostUnregister); \ + __op__(cuMemsetD16Async); \ + __op__(cuMemsetD2D16Async); \ + __op__(cuMemsetD2D32Async); \ + __op__(cuMemsetD2D8Async); \ + __op__(cuMemsetD32Async); \ + __op__(cuMemsetD8Async); \ + __op__(cuModuleLoadDataEx); \ + __op__(cuStreamAddCallback); \ + __op__(cuStreamCreate); \ + __op__(cuStreamDestroy); \ + __op__(cuStreamSynchronize); \ + __op__(cuStreamWaitEvent) + + #if CUDA_VERSION >= 11030 + // cuda 11.3+ gives us handy PFN_... types + #define DECL_FNPTR_EXTERN(name) \ + extern PFN_ ## name name ## _fnptr; + #else + // before cuda 11.3, we have to rely on typeof/decltype + #define DECL_FNPTR_EXTERN(name) \ + extern decltype(&name) name ## _fnptr; + #endif + CUDA_DRIVER_APIS(DECL_FNPTR_EXTERN); + #undef DECL_FNPTR_EXTERN + + #define CUDA_RUNTIME_APIS_PRE_11_0(__op__) \ + __op__(cudaGetDevice, cudaError_t, (int *)); \ + __op__(cudaGetErrorString, const char *, (cudaError_t)); \ + __op__(cudaSetDevice, cudaError_t, (int)); \ + __op__(cudaLaunchKernel, cudaError_t, (const void *, dim3, dim3, void **, size_t, cudaStream_t)) + #if CUDA_VERSION >= 11000 + #define CUDA_RUNTIME_APIS_11_0(__op__) \ + __op__(cudaGetFuncBySymbol, cudaError_t, (cudaFunction_t *, const void *)); + #else + #define CUDA_RUNTIME_APIS_11_0(__op__) + #endif + + #define CUDA_RUNTIME_APIS(__op__) \ + CUDA_RUNTIME_APIS_11_0(__op__) \ + CUDA_RUNTIME_APIS_PRE_11_0(__op__) + + #define DECL_FNPTR_EXTERN(name, retval, params) \ + extern retval (*name ## _fnptr) params; + CUDA_RUNTIME_APIS(DECL_FNPTR_EXTERN); + #undef DECL_FNPTR_EXTERN + +#else + #define CUDA_DRIVER_FNPTR(name) (name) + #define CUDA_RUNTIME_FNPTR(name) (name) 
+#endif + }; // namespace Cuda }; // namespace Realm diff --git a/runtime/realm/cuda/cuda_module.cc b/runtime/realm/cuda/cuda_module.cc index e2e56b883a..a868569901 100644 --- a/runtime/realm/cuda/cuda_module.cc +++ b/runtime/realm/cuda/cuda_module.cc @@ -29,6 +29,17 @@ #include "realm/cuda/cudart_hijack.h" #endif +#ifdef REALM_CUDA_DYNAMIC_LOAD + #ifdef REALM_USE_DLFCN + #include + #else + #error dynamic loading of CUDA driver/runtime requires use of dlfcn! + #endif + #ifdef REALM_USE_CUDART_HIJACK + #error REALM_CUDA_DYNAMIC_LOAD and REALM_USE_CUDART_HIJACK both enabled! + #endif +#endif + #include "realm/mutex.h" #include "realm/utils.h" @@ -42,6 +53,17 @@ #define IS_DEFAULT_STREAM(stream) \ (((stream) == 0) || ((stream) == cudaStreamLegacy) || ((stream) == cudaStreamPerThread)) +#ifdef REALM_CUDA_DYNAMIC_LOAD +// the cuda runtime might be statically linked, in which case we want to use +// those symbols instead of dlopen'ing a fresh libcudart.so +extern "C" { +#define WEAK_RUNTIME_DECL(name, retval, params) \ + retval name params __attribute__((weak)); + CUDA_RUNTIME_APIS(WEAK_RUNTIME_DECL); + #undef WEAK_RUNTIME_DECL +}; +#endif + namespace Realm { namespace Cuda { @@ -54,6 +76,27 @@ namespace Realm { #endif Logger log_stream("gpustream"); +#ifdef REALM_CUDA_DYNAMIC_LOAD + bool cuda_api_fnptrs_loaded = false; + + #if CUDA_VERSION >= 11030 + // cuda 11.3+ gives us handy PFN_... 
types + #define DEFINE_FNPTR(name) \ + PFN_ ## name name ## _fnptr = 0; + #else + // before cuda 11.3, we have to rely on typeof/decltype + #define DEFINE_FNPTR(name) \ + decltype(&name) name ## _fnptr = 0; + #endif + CUDA_DRIVER_APIS(DEFINE_FNPTR); + #undef DEFINE_FNPTR + + #define DEFINE_FNPTR(name, retval, params) \ + retval (*name ## _fnptr) params = 0; + CUDA_RUNTIME_APIS(DEFINE_FNPTR); + #undef DEFINE_FNPTR +#endif + //////////////////////////////////////////////////////////////////////// // // class GPUStream @@ -62,7 +105,7 @@ namespace Realm { : gpu(_gpu), worker(_worker), issuing_copies(false) { assert(worker != 0); - CHECK_CU( cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuStreamCreate)(&stream, CU_STREAM_NON_BLOCKING) ); log_stream.info() << "CUDA stream " << stream << " created for GPU " << gpu; } @@ -71,7 +114,7 @@ namespace Realm { // log_stream.info() << "CUDA stream " << stream << " destroyed - max copies = " // << pending_copies.capacity() << ", max events = " << pending_events.capacity(); - CHECK_CU( cuStreamDestroy(stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuStreamDestroy)(stream) ); } GPU *GPUStream::get_gpu(void) const @@ -109,7 +152,7 @@ namespace Realm { { CUevent e = gpu->event_pool.get_event(); - CHECK_CU( cuEventRecord(e, stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuEventRecord)(e, stream) ); log_stream.debug() << "CUDA fence event " << e << " recorded on stream " << stream << " (GPU " << gpu << ")"; @@ -120,7 +163,7 @@ namespace Realm { { CUevent e = gpu->event_pool.get_event(); - CHECK_CU( cuEventRecord(e, stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuEventRecord)(e, stream) ); log_stream.debug() << "CUDA start event " << e << " recorded on stream " << stream << " (GPU " << gpu << ")"; @@ -132,7 +175,7 @@ namespace Realm { { CUevent e = gpu->event_pool.get_event(); - CHECK_CU( cuEventRecord(e, stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuEventRecord)(e, stream) ); add_event(e, 0, notification); } @@ -174,12 
+217,12 @@ namespace Realm { continue; CUevent e = gpu->event_pool.get_event(); - CHECK_CU( cuEventRecord(e, (*it)->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuEventRecord)(e, (*it)->get_stream()) ); log_stream.debug() << "CUDA stream " << stream << " waiting on stream " << (*it)->get_stream() << " (GPU " << gpu << ")"; - CHECK_CU( cuStreamWaitEvent(stream, e, 0) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuStreamWaitEvent)(stream, e, 0) ); // record this event on our stream add_event(e, 0); @@ -276,7 +319,7 @@ namespace Realm { // we'll keep looking at events until we find one that hasn't triggered bool work_left = true; while(event_valid) { - CUresult res = cuEventQuery(event); + CUresult res = CUDA_DRIVER_FNPTR(cuEventQuery)(event); if(res == CUDA_ERROR_NOT_READY) return true; // oldest event hasn't triggered - check again later @@ -285,8 +328,8 @@ namespace Realm { if(res != CUDA_SUCCESS) { const char *ename = 0; const char *estr = 0; - cuGetErrorName(res, &ename); - cuGetErrorString(res, &estr); + CUDA_DRIVER_FNPTR(cuGetErrorName)(res, &ename); + CUDA_DRIVER_FNPTR(cuGetErrorString)(res, &estr); log_gpu.fatal() << "CUDA error reported on GPU " << gpu->info->index << ": " << estr << " (" << ename << ")"; assert(0); } @@ -374,18 +417,20 @@ namespace Realm { { case GPU_MEMCPY_HOST_TO_DEVICE: { - CHECK_CU( cuMemcpyHtoDAsync((CUdeviceptr)(((char*)dst)+span_start), - (((char*)src)+span_start), - span_bytes, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpyHtoDAsync) + ((CUdeviceptr)(((char*)dst)+span_start), + (((char*)src)+span_start), + span_bytes, + raw_stream) ); break; } case GPU_MEMCPY_DEVICE_TO_HOST: { - CHECK_CU( cuMemcpyDtoHAsync((((char*)dst)+span_start), - (CUdeviceptr)(((char*)src)+span_start), - span_bytes, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpyDtoHAsync) + ((((char*)dst)+span_start), + (CUdeviceptr)(((char*)src)+span_start), + span_bytes, + raw_stream) ); #ifdef REALM_USE_VALGRIND_ANNOTATIONS 
VALGRIND_MAKE_MEM_DEFINED((((char*)dst)+span_start), span_bytes); #endif @@ -394,10 +439,11 @@ namespace Realm { case GPU_MEMCPY_DEVICE_TO_DEVICE: case GPU_MEMCPY_PEER_TO_PEER: { - CHECK_CU( cuMemcpyDtoDAsync((CUdeviceptr)(((char*)dst)+span_start), - (CUdeviceptr)(((char*)src)+span_start), - span_bytes, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpyDtoDAsync) + ((CUdeviceptr)(((char*)dst)+span_start), + (CUdeviceptr)(((char*)src)+span_start), + span_bytes, + raw_stream) ); break; } default: @@ -468,7 +514,8 @@ namespace Realm { copy_info.dstXInBytes = 0; copy_info.WidthInBytes = bytes; copy_info.Height = lines; - CHECK_CU( cuMemcpy2DAsync(©_info, stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpy2DAsync) + (©_info, stream->get_stream()) ); if(notification) stream->add_notification(notification); @@ -541,7 +588,8 @@ namespace Realm { copy_info.WidthInBytes = bytes; copy_info.Height = height; copy_info.Depth = depth; - CHECK_CU( cuMemcpy3DAsync(©_info, stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpy3DAsync) + (©_info, stream->get_stream()) ); } else { // we can unroll either lines (height) or planes (depth) - choose the // smaller of the two to minimize API calls @@ -589,7 +637,8 @@ namespace Realm { copy_info.Height = lines_2d; for(size_t i = 0; i < count; i++) { - CHECK_CU( cuMemcpy2DAsync(©_info, stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpy2DAsync) + (©_info, stream->get_stream()) ); copy_info.srcDevice += src_delta; copy_info.srcHost = reinterpret_cast(copy_info.srcDevice); copy_info.dstDevice += dst_delta; @@ -650,27 +699,30 @@ namespace Realm { { unsigned char fill_u8; memcpy(&fill_u8, fill_data.direct, 1); - CHECK_CU( cuMemsetD8Async(CUdeviceptr(dst), - fill_u8, bytes, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD8Async) + (CUdeviceptr(dst), + fill_u8, bytes, + raw_stream) ); break; } case 2: { unsigned short fill_u16; memcpy(&fill_u16, fill_data.direct, 2); - CHECK_CU( 
cuMemsetD16Async(CUdeviceptr(dst), - fill_u16, bytes >> 1, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD16Async) + (CUdeviceptr(dst), + fill_u16, bytes >> 1, + raw_stream) ); break; } case 4: { unsigned int fill_u32; memcpy(&fill_u32, fill_data.direct, 4); - CHECK_CU( cuMemsetD32Async(CUdeviceptr(dst), - fill_u32, bytes >> 2, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD32Async) + (CUdeviceptr(dst), + fill_u32, bytes >> 2, + raw_stream) ); break; } default: @@ -685,31 +737,34 @@ namespace Realm { for(size_t offset = 0; offset < fill_data_size; offset += 4) { unsigned int fill_u32; memcpy(&fill_u32, srcdata + offset, 4); - CHECK_CU( cuMemsetD2D32Async(CUdeviceptr(dst) + offset, - fill_data_size /*pitch*/, - fill_u32, - 1 /*width*/, elements /*height*/, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D32Async) + (CUdeviceptr(dst) + offset, + fill_data_size /*pitch*/, + fill_u32, + 1 /*width*/, elements /*height*/, + raw_stream) ); } } else if((fill_data_size & 1) == 0) { for(size_t offset = 0; offset < fill_data_size; offset += 2) { unsigned short fill_u16; memcpy(&fill_u16, srcdata + offset, 2); - CHECK_CU( cuMemsetD2D16Async(CUdeviceptr(dst) + offset, - fill_data_size /*pitch*/, - fill_u16, - 1 /*width*/, elements /*height*/, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D16Async) + (CUdeviceptr(dst) + offset, + fill_data_size /*pitch*/, + fill_u16, + 1 /*width*/, elements /*height*/, + raw_stream) ); } } else { for(size_t offset = 0; offset < fill_data_size; offset += 1) { unsigned char fill_u8; memcpy(&fill_u8, srcdata + offset, 1); - CHECK_CU( cuMemsetD2D8Async(CUdeviceptr(dst) + offset, - fill_data_size /*pitch*/, - fill_u8, - 1 /*width*/, elements /*height*/, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D8Async) + (CUdeviceptr(dst) + offset, + fill_data_size /*pitch*/, + fill_u8, + 1 /*width*/, elements /*height*/, + raw_stream) ); } } } @@ -765,27 +820,30 @@ namespace Realm { { unsigned char fill_u8; 
memcpy(&fill_u8, fill_data.direct, 1); - CHECK_CU( cuMemsetD2D8Async(CUdeviceptr(dst), dst_stride, - fill_u8, bytes, lines, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D8Async) + (CUdeviceptr(dst), dst_stride, + fill_u8, bytes, lines, + raw_stream) ); break; } case 2: { unsigned short fill_u16; memcpy(&fill_u16, fill_data.direct, 2); - CHECK_CU( cuMemsetD2D16Async(CUdeviceptr(dst), dst_stride, - fill_u16, bytes >> 1, lines, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D16Async) + (CUdeviceptr(dst), dst_stride, + fill_u16, bytes >> 1, lines, + raw_stream) ); break; } case 4: { unsigned int fill_u32; memcpy(&fill_u32, fill_data.direct, 4); - CHECK_CU( cuMemsetD2D32Async(CUdeviceptr(dst), dst_stride, - fill_u32, bytes >> 2, lines, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D32Async) + (CUdeviceptr(dst), dst_stride, + fill_u32, bytes >> 2, lines, + raw_stream) ); break; } default: @@ -801,33 +859,36 @@ namespace Realm { unsigned int fill_u32; memcpy(&fill_u32, srcdata + offset, 4); for(size_t l = 0; l < lines; l++) - CHECK_CU( cuMemsetD2D32Async(CUdeviceptr(dst) + offset + (l * dst_stride), - fill_data_size /*pitch*/, - fill_u32, - 1 /*width*/, elements /*height*/, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D32Async) + (CUdeviceptr(dst) + offset + (l * dst_stride), + fill_data_size /*pitch*/, + fill_u32, + 1 /*width*/, elements /*height*/, + raw_stream) ); } } else if((fill_data_size & 1) == 0) { for(size_t offset = 0; offset < fill_data_size; offset += 2) { unsigned short fill_u16; memcpy(&fill_u16, srcdata + offset, 2); for(size_t l = 0; l < lines; l++) - CHECK_CU( cuMemsetD2D16Async(CUdeviceptr(dst) + offset + (l * dst_stride), - fill_data_size /*pitch*/, - fill_u16, - 1 /*width*/, elements /*height*/, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D16Async) + (CUdeviceptr(dst) + offset + (l * dst_stride), + fill_data_size /*pitch*/, + fill_u16, + 1 /*width*/, elements /*height*/, + raw_stream) ); } } 
else { for(size_t offset = 0; offset < fill_data_size; offset += 1) { unsigned char fill_u8; memcpy(&fill_u8, srcdata + offset, 1); for(size_t l = 0; l < lines; l++) - CHECK_CU( cuMemsetD2D8Async(CUdeviceptr(dst) + offset + (l * dst_stride), - fill_data_size /*pitch*/, - fill_u8, - 1 /*width*/, elements /*height*/, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D8Async) + (CUdeviceptr(dst) + offset + (l * dst_stride), + fill_data_size /*pitch*/, + fill_u8, + 1 /*width*/, elements /*height*/, + raw_stream) ); } } } @@ -886,27 +947,30 @@ namespace Realm { { unsigned char fill_u8; memcpy(&fill_u8, fill_data.direct, 1); - CHECK_CU( cuMemsetD2D8Async(CUdeviceptr(dst), dst_stride, - fill_u8, bytes, height, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D8Async) + (CUdeviceptr(dst), dst_stride, + fill_u8, bytes, height, + raw_stream) ); break; } case 2: { unsigned short fill_u16; memcpy(&fill_u16, fill_data.direct, 2); - CHECK_CU( cuMemsetD2D16Async(CUdeviceptr(dst), dst_stride, - fill_u16, bytes >> 1, height, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D16Async) + (CUdeviceptr(dst), dst_stride, + fill_u16, bytes >> 1, height, + raw_stream) ); break; } case 4: { unsigned int fill_u32; memcpy(&fill_u32, fill_data.direct, 4); - CHECK_CU( cuMemsetD2D32Async(CUdeviceptr(dst), dst_stride, - fill_u32, bytes >> 2, height, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D32Async) + (CUdeviceptr(dst), dst_stride, + fill_u32, bytes >> 2, height, + raw_stream) ); break; } default: @@ -922,33 +986,36 @@ namespace Realm { unsigned int fill_u32; memcpy(&fill_u32, srcdata + offset, 4); for(size_t l = 0; l < height; l++) - CHECK_CU( cuMemsetD2D32Async(CUdeviceptr(dst) + offset + (l * dst_stride), - fill_data_size /*pitch*/, - fill_u32, - 1 /*width*/, elements /*height*/, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D32Async) + (CUdeviceptr(dst) + offset + (l * dst_stride), + fill_data_size /*pitch*/, + fill_u32, + 1 /*width*/, 
elements /*height*/, + raw_stream) ); } } else if((fill_data_size & 1) == 0) { for(size_t offset = 0; offset < fill_data_size; offset += 2) { unsigned short fill_u16; memcpy(&fill_u16, srcdata + offset, 2); for(size_t l = 0; l < height; l++) - CHECK_CU( cuMemsetD2D16Async(CUdeviceptr(dst) + offset + (l * dst_stride), - fill_data_size /*pitch*/, - fill_u16, - 1 /*width*/, elements /*height*/, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D16Async) + (CUdeviceptr(dst) + offset + (l * dst_stride), + fill_data_size /*pitch*/, + fill_u16, + 1 /*width*/, elements /*height*/, + raw_stream) ); } } else { for(size_t offset = 0; offset < fill_data_size; offset += 1) { unsigned char fill_u8; memcpy(&fill_u8, srcdata + offset, 1); for(size_t l = 0; l < height; l++) - CHECK_CU( cuMemsetD2D8Async(CUdeviceptr(dst) + offset + (l * dst_stride), - fill_data_size /*pitch*/, - fill_u8, - 1 /*width*/, elements /*height*/, - raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD2D8Async) + (CUdeviceptr(dst) + offset + (l * dst_stride), + fill_data_size /*pitch*/, + fill_u8, + 1 /*width*/, elements /*height*/, + raw_stream) ); } } } @@ -983,7 +1050,8 @@ namespace Realm { copy_info.dstDevice = ((CUdeviceptr)dst + (done * dst_pstride)); copy_info.Depth = todo; - CHECK_CU( cuMemcpy3DAsync(©_info, raw_stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpy3DAsync) + (©_info, raw_stream) ); } } @@ -1080,7 +1148,7 @@ namespace Realm { void GPUWorkFence::enqueue_on_stream(GPUStream *stream) { if(stream->get_gpu()->module->cfg_fences_use_callbacks) { - CHECK_CU( cuStreamAddCallback(stream->get_stream(), &cuda_callback, (void *)this, 0) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuStreamAddCallback)(stream->get_stream(), &cuda_callback, (void *)this, 0) ); } else { stream->add_fence(this); } @@ -1110,7 +1178,7 @@ namespace Realm { void GPUWorkStart::enqueue_on_stream(GPUStream *stream) { if(stream->get_gpu()->module->cfg_fences_use_callbacks) { - CHECK_CU( cuStreamAddCallback(stream->get_stream(), 
&cuda_start_callback, (void *)this, 0) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuStreamAddCallback)(stream->get_stream(), &cuda_start_callback, (void *)this, 0) ); } else { stream->add_start_event(this); } @@ -1146,7 +1214,7 @@ namespace Realm { //log_stream.info() << "gpu memcpy fence " << this << " (fence = " << fence << ") executed"; fence->enqueue_on_stream(stream); #ifdef FORCE_GPU_STREAM_SYNCHRONIZE - CHECK_CU( cuStreamSynchronize(stream->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuStreamSynchronize)(stream->get_stream()) ); #endif } @@ -1179,7 +1247,7 @@ namespace Realm { // TODO: measure how much benefit is derived from CU_EVENT_DISABLE_TIMING and // consider using them for completion callbacks for(int i = 0; i < init_size; i++) - CHECK_CU( cuEventCreate(&available_events[i], CU_EVENT_DEFAULT) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuEventCreate)(&available_events[i], CU_EVENT_DEFAULT) ); } void GPUEventPool::empty_pool(void) @@ -1190,7 +1258,7 @@ namespace Realm { log_stream.warning() << "Application leaking " << external_count << " cuda events"; for(int i = 0; i < current_size; i++) - CHECK_CU( cuEventDestroy(available_events[i]) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuEventDestroy)(available_events[i]) ); current_size = 0; total_size = 0; @@ -1214,9 +1282,9 @@ namespace Realm { available_events.resize(total_size); for(int i = 0; i < batch_size; i++) - CHECK_CU( cuEventCreate(&available_events[i], CU_EVENT_DEFAULT) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuEventCreate)(&available_events[i], CU_EVENT_DEFAULT) ); } - + if(external) external_count++; @@ -1378,14 +1446,14 @@ namespace Realm { { AutoGPUContext agc(gpu); - CUresult res = cuCtxSynchronize(); + CUresult res = CUDA_DRIVER_FNPTR(cuCtxSynchronize)(); // complain loudly about any errors if(res != CUDA_SUCCESS) { const char *ename = 0; const char *estr = 0; - cuGetErrorName(res, &ename); - cuGetErrorString(res, &estr); + CUDA_DRIVER_FNPTR(cuGetErrorName)(res, &ename); + CUDA_DRIVER_FNPTR(cuGetErrorString)(res, &estr); 
log_gpu.fatal() << "CUDA error reported on GPU " << gpu->info->index << ": " << estr << " (" << ename << ")"; abort(); } @@ -1535,7 +1603,7 @@ namespace Realm { // A useful debugging macro #ifdef FORCE_GPU_STREAM_SYNCHRONIZE - CHECK_CU( cuStreamSynchronize(s->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuStreamSynchronize)(s->get_stream()) ); #endif // pop the CUDA context for this GPU back off @@ -1584,7 +1652,7 @@ namespace Realm { } // we didn't use streams here, so synchronize the whole context - CHECK_CU( cuCtxSynchronize() ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuCtxSynchronize)() ); gpu_proc->block_on_synchronize = false; // pop the CUDA context for this GPU back off @@ -1947,7 +2015,7 @@ namespace Realm { { AutoGPUContext agc(gpu); - CHECK_CU( cuCtxSynchronize() ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuCtxSynchronize)() ); } } @@ -2289,10 +2357,10 @@ namespace Realm { void GPUProcessor::stream_wait_on_event(cudaStream_t stream, cudaEvent_t event) { if(IS_DEFAULT_STREAM(stream)) - CHECK_CU( cuStreamWaitEvent( + CHECK_CU( CUDA_DRIVER_FNPTR(cuStreamWaitEvent)( ThreadLocal::current_gpu_stream->get_stream(), event, 0) ); else - CHECK_CU( cuStreamWaitEvent(stream, event, 0) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuStreamWaitEvent)(stream, event, 0) ); } void GPUProcessor::stream_synchronize(cudaStream_t stream) @@ -2319,11 +2387,11 @@ namespace Realm { << stream << " that Realm did not create which suggests " << "that there is another copy of the CUDA runtime " << "somewhere making its own streams... be VERY careful."; - CHECK_CU( cuStreamSynchronize(stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuStreamSynchronize)(stream) ); } } else { // oh well... - CHECK_CU( cuStreamSynchronize(stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuStreamSynchronize)(stream) ); } } else @@ -2376,7 +2444,7 @@ namespace Realm { waiter.preempt(); } else { // oh well... 
- CHECK_CU( cuStreamSynchronize(current->get_stream()) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuStreamSynchronize)(current->get_stream()) ); } } @@ -2407,14 +2475,14 @@ namespace Realm { CUevent e = event; if(IS_DEFAULT_STREAM(stream)) stream = ThreadLocal::current_gpu_stream->get_stream(); - CHECK_CU( cuEventRecord(e, stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuEventRecord)(e, stream) ); } void GPUProcessor::event_synchronize(cudaEvent_t event) { // TODO: consider suspending task rather than busy-waiting here... CUevent e = event; - CHECK_CU( cuEventSynchronize(e) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuEventSynchronize)(e) ); } void GPUProcessor::event_elapsed_time(float *ms, cudaEvent_t start, cudaEvent_t end) @@ -2422,7 +2490,7 @@ namespace Realm { // TODO: consider suspending task rather than busy-waiting here... CUevent e1 = start; CUevent e2 = end; - CHECK_CU( cuEventElapsedTime(ms, e1, e2) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuEventElapsedTime)(ms, e1, e2) ); } GPUProcessor::LaunchConfig::LaunchConfig(dim3 _grid, dim3 _block, size_t _shared) @@ -2532,7 +2600,8 @@ namespace Realm { CUstream current = ThreadLocal::current_gpu_stream->get_stream(); // the synchronous copy still uses cuMemcpyAsync so that we can limit the // synchronization to just the right stream - CHECK_CU( cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, size, current) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpyAsync) + ((CUdeviceptr)dst, (CUdeviceptr)src, size, current) ); stream_synchronize(current); } @@ -2541,7 +2610,8 @@ namespace Realm { { if(IS_DEFAULT_STREAM(stream)) stream = ThreadLocal::current_gpu_stream->get_stream(); - CHECK_CU( cuMemcpyAsync((CUdeviceptr)dst, (CUdeviceptr)src, size, stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpyAsync) + ((CUdeviceptr)dst, (CUdeviceptr)src, size, stream) ); // no synchronization here } @@ -2572,7 +2642,8 @@ namespace Realm { copy_info.Height = height; // the synchronous copy still uses cuMemcpyAsync so that we can limit the // synchronization to just the 
right stream - CHECK_CU( cuMemcpy2DAsync(©_info, current) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpy2DAsync) + (©_info, current) ); stream_synchronize(current); } @@ -2601,7 +2672,8 @@ namespace Realm { copy_info.dstXInBytes = 0; copy_info.WidthInBytes = width; copy_info.Height = height; - CHECK_CU( cuMemcpy2DAsync(©_info, stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpy2DAsync) + (©_info, stream) ); // no synchronization here } @@ -2611,8 +2683,9 @@ namespace Realm { { CUstream current = ThreadLocal::current_gpu_stream->get_stream(); CUdeviceptr var_base = gpu->lookup_variable(dst); - CHECK_CU( cuMemcpyAsync(var_base + offset, - (CUdeviceptr)src, size, current) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpyAsync) + (var_base + offset, + (CUdeviceptr)src, size, current) ); stream_synchronize(current); } @@ -2623,8 +2696,9 @@ namespace Realm { if(IS_DEFAULT_STREAM(stream)) stream = ThreadLocal::current_gpu_stream->get_stream(); CUdeviceptr var_base = gpu->lookup_variable(dst); - CHECK_CU( cuMemcpyAsync(var_base + offset, - (CUdeviceptr)src, size, stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpyAsync) + (var_base + offset, + (CUdeviceptr)src, size, stream) ); // no synchronization here } @@ -2634,9 +2708,10 @@ namespace Realm { { CUstream current = ThreadLocal::current_gpu_stream->get_stream(); CUdeviceptr var_base = gpu->lookup_variable(src); - CHECK_CU( cuMemcpyAsync((CUdeviceptr)dst, - var_base + offset, - size, current) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpyAsync) + ((CUdeviceptr)dst, + var_base + offset, + size, current) ); stream_synchronize(current); } @@ -2647,9 +2722,10 @@ namespace Realm { if(IS_DEFAULT_STREAM(stream)) stream = ThreadLocal::current_gpu_stream->get_stream(); CUdeviceptr var_base = gpu->lookup_variable(src); - CHECK_CU( cuMemcpyAsync((CUdeviceptr)dst, - var_base + offset, - size, stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemcpyAsync) + ((CUdeviceptr)dst, + var_base + offset, + size, stream) ); // no synchronization here } #endif @@ -2657,8 
+2733,9 @@ namespace Realm { void GPUProcessor::gpu_memset(void *dst, int value, size_t count) { CUstream current = ThreadLocal::current_gpu_stream->get_stream(); - CHECK_CU( cuMemsetD8Async((CUdeviceptr)dst, (unsigned char)value, - count, current) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD8Async) + ((CUdeviceptr)dst, (unsigned char)value, + count, current) ); } void GPUProcessor::gpu_memset_async(void *dst, int value, @@ -2666,8 +2743,9 @@ namespace Realm { { if(IS_DEFAULT_STREAM(stream)) stream = ThreadLocal::current_gpu_stream->get_stream(); - CHECK_CU( cuMemsetD8Async((CUdeviceptr)dst, (unsigned char)value, - count, stream) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemsetD8Async) + ((CUdeviceptr)dst, (unsigned char)value, + count, stream) ); } @@ -2733,21 +2811,21 @@ namespace Realm { // free memory if(fbmem_base) - CHECK_CU( cuMemFree(fbmem_base) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemFree)(fbmem_base) ); - CHECK_CU( cuDevicePrimaryCtxRelease(info->device) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuDevicePrimaryCtxRelease)(info->device) ); } void GPU::push_context(void) { - CHECK_CU( cuCtxPushCurrent(context) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuCtxPushCurrent)(context) ); } void GPU::pop_context(void) { // the context we pop had better be ours... 
CUcontext popped; - CHECK_CU( cuCtxPopCurrent(&popped) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuCtxPopCurrent)(&popped) ); assert(popped == context); } @@ -2816,7 +2894,7 @@ namespace Realm { { AutoGPUContext agc(this); - CUresult ret = cuCtxEnablePeerAccess((*it)->context, 0); + CUresult ret = CUDA_DRIVER_FNPTR(cuCtxEnablePeerAccess)((*it)->context, 0); if((ret != CUDA_SUCCESS) && (ret != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED)) REPORT_CU_ERROR("cuCtxEnablePeerAccess((*it)->context, 0)", ret); @@ -2841,18 +2919,19 @@ namespace Realm { { AutoGPUContext agc(this); - CUresult ret = cuMemAlloc(&fbmem_base, size); + CUresult ret = CUDA_DRIVER_FNPTR(cuMemAlloc)(&fbmem_base, size); if(ret != CUDA_SUCCESS) { if(ret == CUDA_ERROR_OUT_OF_MEMORY) { size_t free_bytes, total_bytes; - CHECK_CU( cuMemGetInfo(&free_bytes, &total_bytes) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemGetInfo) + (&free_bytes, &total_bytes) ); log_gpu.fatal() << "insufficient memory on gpu " << info->index << ": " << size << " bytes needed (from -ll:fsize), " << free_bytes << " (out of " << total_bytes << ") available"; } else { const char *errstring = "error message not available"; #if CUDA_VERSION >= 6050 - cuGetErrorName(ret, &errstring); + CUDA_DRIVER_FNPTR(cuGetErrorName)(ret, &errstring); #endif log_gpu.fatal() << "unexpected error from cuMemAlloc on gpu " << info->index << ": result=" << ret @@ -3010,8 +3089,11 @@ namespace Realm { option_vals[2] = log_error_buffer; option_vals[3] = (void*)buffer_size; CUmodule module; - CUresult result = cuModuleLoadDataEx(&module, data, num_options, - jit_options, option_vals); + CUresult result = CUDA_DRIVER_FNPTR(cuModuleLoadDataEx)(&module, + data, + num_options, + jit_options, + option_vals); if (result != CUDA_SUCCESS) { #ifdef REALM_ON_MACOS @@ -3031,8 +3113,8 @@ namespace Realm { log_error_buffer); #if CUDA_VERSION >= 6050 const char *name, *str; - CHECK_CU( cuGetErrorName(result, &name) ); - CHECK_CU( cuGetErrorString(result, &str) ); + CHECK_CU( 
CUDA_DRIVER_FNPTR(cuGetErrorName)(result, &name) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuGetErrorString)(result, &str) ); fprintf(stderr,"CU: cuModuleLoadDataEx = %d (%s): %s\n", result, name, str); #else @@ -3106,6 +3188,124 @@ namespace Realm { delete_container_contents(gpu_info); } +#ifdef REALM_CUDA_DYNAMIC_LOAD + static bool resolve_cuda_api_fnptrs(bool required) + { + if(cuda_api_fnptrs_loaded) + return true; + + // driver symbols have to come from a dynamic libcuda +#ifdef REALM_USE_DLFCN + log_gpu.info() << "dynamically loading libcuda.so"; + void *libcuda = dlopen("libcuda.so", RTLD_NOW); + if(!libcuda) { + if(required) { + log_gpu.fatal() << "could not open libcuda.so: " << strerror(errno); + abort(); + } else { + log_gpu.info() << "could not open libcuda.so: " << strerror(errno); + return false; + } + } +#if CUDA_VERSION >= 11030 + // cuda 11.3+ provides cuGetProcAddress to handle versioning nicely + PFN_cuGetProcAddress driver_getproc = 0; + { + void *sym = dlsym(libcuda, "cuGetProcAddress"); + if(sym) { + driver_getproc = reinterpret_cast(sym); + } else { + if(required) { + log_gpu.fatal() << "symbol 'cuGetProcAddress' not found in libcuda.so'"; + abort(); + } else { + log_gpu.info() << "symbol 'cuGetProcAddress' not found in libcuda.so'"; + } + } + } +#else + // before cuda 11.3, we have to dlsym things, but rely on cuda.h's + // compile-time translation to versioned function names +#define STRINGIFY(s) #s +#define DRIVER_GET_FNPTR(name) \ + do { \ + void *sym = dlsym(libcuda, STRINGIFY(name)); \ + if(!sym) { \ + log_gpu.fatal() << "symbol '" STRINGIFY(name) " missing from libcuda.so!"; \ + abort(); \ + } \ + name ## _fnptr = reinterpret_cast(sym); \ + } while(0) + CUDA_DRIVER_APIS(DRIVER_GET_FNPTR); +#undef DRIVER_GET_FNPTR +#undef STRINGIFY +#endif +#endif +#if CUDA_VERSION >= 11030 +#define DRIVER_GET_FNPTR(name) \ + CHECK_CU( (driver_getproc)(#name, (void **)&name ## _fnptr, \ + CUDA_VERSION, CU_GET_PROC_ADDRESS_DEFAULT) ); + 
CUDA_DRIVER_APIS(DRIVER_GET_FNPTR); +#undef DRIVER_GET_FNPTR +#endif + + // see if we've been statically linked against libcudart_static.a + if(cudaGetDevice) { + log_gpu.info() << "using statically linked libcudart"; +#define RUNTIME_STATIC_FNPTR(name, retval, params) \ + if(static_cast(name)) { \ + name ## _fnptr = name; \ + } else { \ + log_gpu.fatal() << "static cudart linkage missing symbol '" #name "'!"; \ + abort(); \ + } + CUDA_RUNTIME_APIS(RUNTIME_STATIC_FNPTR); +#undef RUNTIME_STATIC_FNPTR + } else { + log_gpu.info() << "dynamically loading libcudart.so"; + void *libcudart = dlopen("libcudart.so", RTLD_NOW); + if(libcudart) { + // sanity-check which version of the runtime we loaded + { + void *sym = dlsym(libcudart, "cudaRuntimeGetVersion"); + if(sym) { + cudaError_t (*runtime_version_fnptr)(int *); + runtime_version_fnptr = reinterpret_cast(sym); + int loaded_runtime_version = 0; + CHECK_CUDART( runtime_version_fnptr(&loaded_runtime_version) ); + if(loaded_runtime_version < CUDA_VERSION) + log_gpu.error() << "CUDA runtime version mismatch - expected " << CUDA_VERSION << ", got " << loaded_runtime_version << " - consider adjusting LD_LIBRARY_PATH"; + } else { + log_gpu.fatal() << "symbol 'cudaRuntimeGetVersion' missing from libcudart.so!"; + abort(); + } + } +#define RUNTIME_GET_FNPTR(name, retval, params) \ + do { \ + void *sym = dlsym(libcudart, #name); \ + if(!sym) { \ + log_gpu.fatal() << "symbol '" #name "' missing from libcudart.so!"; \ + abort(); \ + } \ + name ## _fnptr = reinterpret_cast(sym); \ + } while(0) + CUDA_RUNTIME_APIS(RUNTIME_GET_FNPTR); +#undef RUNTIME_GET_FNPTR + } else { + if(required) { + log_gpu.fatal() << "could not open libcudart.so: " << strerror(errno); + abort(); + } else { + log_gpu.info() << "could not open libcudart.so: " << strerror(errno); + return false; + } + } + } + + return true; + } +#endif + /*static*/ Module *CudaModule::create_module(RuntimeImpl *runtime, std::vector& cmdline) { @@ -3144,17 +3344,25 @@ namespace 
Realm { } } +#ifdef REALM_CUDA_DYNAMIC_LOAD + if(!resolve_cuda_api_fnptrs(m->cfg_num_gpus > 0)) { + // warning was printed in resolve function + delete m; + return 0; + } +#endif + std::vector infos; { int num_devices; - CUresult ret = cuInit(0); + CUresult ret = CUDA_DRIVER_FNPTR(cuInit)(0); if(ret != CUDA_SUCCESS) { // failure to initialize the driver is a fatal error if we know gpus // have been requested if(m->cfg_num_gpus > 0) { const char *err_name, *err_str; - cuGetErrorName(ret, &err_name); - cuGetErrorString(ret, &err_str); + CUDA_DRIVER_FNPTR(cuGetErrorName)(ret, &err_name); + CUDA_DRIVER_FNPTR(cuGetErrorString)(ret, &err_str); log_gpu.fatal() << "gpus requested, but cuInit(0) returned " << ret << " (" << err_name << "): " << err_str; abort(); @@ -3167,24 +3375,30 @@ namespace Realm { return 0; } } else { - CHECK_CU( cuDeviceGetCount(&num_devices) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuDeviceGetCount)(&num_devices) ); for(int i = 0; i < num_devices; i++) { GPUInfo *info = new GPUInfo; info->index = i; - CHECK_CU( cuDeviceGet(&info->device, i) ); - CHECK_CU( cuDeviceGetName(info->name, sizeof(info->name), info->device) ); - CHECK_CU( cuDeviceGetAttribute(&info->major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info->device) ); - CHECK_CU( cuDeviceGetAttribute(&info->minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, info->device) ); - CHECK_CU( cuDeviceTotalMem(&info->totalGlobalMem, info->device) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuDeviceGet)(&info->device, i) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuDeviceGetName)(info->name, sizeof(info->name), info->device) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuDeviceGetAttribute) + (&info->major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + info->device) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuDeviceGetAttribute) + (&info->minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + info->device) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuDeviceTotalMem) + (&info->totalGlobalMem, info->device) ); #ifdef REALM_USE_CUDART_HIJACK // We 
only need the rest of these properties for the hijack #define GET_DEVICE_PROP(member, name) \ do { \ int tmp; \ - CHECK_CU( cuDeviceGetAttribute(&tmp, CU_DEVICE_ATTRIBUTE_##name, info->device) ); \ + CHECK_CU( CUDA_DRIVER_FNPTR(cuDeviceGetAttribute) \ + (&tmp, CU_DEVICE_ATTRIBUTE_##name, info->device) ); \ info->member = tmp; \ } while(0) // SCREW TEXTURES AND SURFACES FOR NOW! @@ -3266,9 +3480,9 @@ namespace Realm { it2++) if(it1 != it2) { int can_access; - CHECK_CU( cuDeviceCanAccessPeer(&can_access, - (*it1)->device, - (*it2)->device) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuDeviceCanAccessPeer)(&can_access, + (*it1)->device, + (*it2)->device) ); if(can_access) { log_gpu.info() << "p2p access from device " << (*it1)->index << " to device " << (*it2)->index; @@ -3317,13 +3531,13 @@ namespace Realm { if(cfg_lmem_resize_to_max) flags |= CU_CTX_LMEM_RESIZE_TO_MAX; - CUresult res = cuDevicePrimaryCtxSetFlags(gpu_info[i]->device, flags); + CUresult res = CUDA_DRIVER_FNPTR(cuDevicePrimaryCtxSetFlags)(gpu_info[i]->device, flags); if(res != CUDA_SUCCESS) { bool lmem_ok; if(res == CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE) { if(cfg_lmem_resize_to_max) { unsigned act_flags = 0; - CHECK_CU( cuCtxGetFlags(&act_flags) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuCtxGetFlags)(&act_flags) ); lmem_ok = ((act_flags & CU_CTX_LMEM_RESIZE_TO_MAX) != 0); } else lmem_ok = true; @@ -3336,8 +3550,8 @@ namespace Realm { } CUcontext context; - CUresult res = cuDevicePrimaryCtxRetain(&context, - gpu_info[i]->device); + CUresult res = CUDA_DRIVER_FNPTR(cuDevicePrimaryCtxRetain)(&context, + gpu_info[i]->device); // a busy GPU might return INVALID_DEVICE or OUT_OF_MEMORY here if((res == CUDA_ERROR_INVALID_DEVICE) || (res == CUDA_ERROR_OUT_OF_MEMORY)) { @@ -3354,10 +3568,10 @@ namespace Realm { if(cfg_min_avail_mem > 0) { size_t total_mem, avail_mem; - CHECK_CU( cuMemGetInfo(&avail_mem, &total_mem) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemGetInfo)(&avail_mem, &total_mem) ); if(avail_mem < cfg_min_avail_mem) { 
log_gpu.info() << "GPU " << gpu_info[i]->device << " does not have enough available memory (" << avail_mem << " < " << cfg_min_avail_mem << ") - skipping"; - CHECK_CU( cuDevicePrimaryCtxRelease(gpu_info[i]->device) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuDevicePrimaryCtxRelease)(gpu_info[i]->device) ); continue; } } @@ -3411,9 +3625,9 @@ namespace Realm { { AutoGPUContext agc(gpus[0]); - CUresult ret = cuMemHostAlloc(&zcmem_cpu_base, - cfg_zc_mem_size, - CU_MEMHOSTALLOC_PORTABLE | CU_MEMHOSTALLOC_DEVICEMAP); + CUresult ret = CUDA_DRIVER_FNPTR(cuMemHostAlloc)(&zcmem_cpu_base, + cfg_zc_mem_size, + CU_MEMHOSTALLOC_PORTABLE | CU_MEMHOSTALLOC_DEVICEMAP); if(ret != CUDA_SUCCESS) { if(ret == CUDA_ERROR_OUT_OF_MEMORY) { log_gpu.fatal() << "insufficient device-mappable host memory: " @@ -3421,16 +3635,17 @@ namespace Realm { } else { const char *errstring = "error message not available"; #if CUDA_VERSION >= 6050 - cuGetErrorName(ret, &errstring); + CUDA_DRIVER_FNPTR(cuGetErrorName)(ret, &errstring); #endif log_gpu.fatal() << "unexpected error from cuMemHostAlloc: result=" << ret << " (" << errstring << ")"; } abort(); } - CHECK_CU( cuMemHostGetDevicePointer(&zcmem_gpu_base, - zcmem_cpu_base, - 0) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemHostGetDevicePointer) + (&zcmem_gpu_base, + zcmem_cpu_base, + 0) ); // right now there are asssumptions in several places that unified addressing keeps // the CPU and GPU addresses the same assert(zcmem_cpu_base == (void *)zcmem_gpu_base); @@ -3448,7 +3663,9 @@ namespace Realm { CUresult ret; { AutoGPUContext agc(gpus[i]); - ret = cuMemHostGetDevicePointer(&gpuptr, zcmem_cpu_base, 0); + ret = CUDA_DRIVER_FNPTR(cuMemHostGetDevicePointer)(&gpuptr, + zcmem_cpu_base, + 0); } if((ret == CUDA_SUCCESS) && (gpuptr == zcmem_gpu_base)) { gpus[i]->pinned_sysmems.insert(zcmem->me); @@ -3463,11 +3680,12 @@ namespace Realm { CUdeviceptr zcib_gpu_base; { AutoGPUContext agc(gpus[0]); - CHECK_CU( cuMemHostAlloc(&zcib_cpu_base, - cfg_zc_ib_size, - 
CU_MEMHOSTALLOC_PORTABLE | CU_MEMHOSTALLOC_DEVICEMAP) ); - CHECK_CU( cuMemHostGetDevicePointer(&zcib_gpu_base, - zcib_cpu_base, 0) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemHostAlloc) + (&zcib_cpu_base, + cfg_zc_ib_size, + CU_MEMHOSTALLOC_PORTABLE | CU_MEMHOSTALLOC_DEVICEMAP) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemHostGetDevicePointer) + (&zcib_gpu_base, zcib_cpu_base, 0) ); // right now there are asssumptions in several places that unified addressing keeps // the CPU and GPU addresses the same assert(zcib_cpu_base == (void *)zcib_gpu_base); @@ -3484,7 +3702,9 @@ namespace Realm { CUresult ret; { AutoGPUContext agc(gpus[i]); - ret = cuMemHostGetDevicePointer(&gpuptr, zcib_cpu_base, 0); + ret = CUDA_DRIVER_FNPTR(cuMemHostGetDevicePointer)(&gpuptr, + zcib_cpu_base, + 0); } if ((ret == CUDA_SUCCESS) && (gpuptr == zcib_gpu_base)) { gpus[i]->pinned_sysmems.insert(ib_mem->me); @@ -3502,8 +3722,9 @@ namespace Realm { { AutoGPUContext agc(gpus[0]); - CUresult ret = cuMemAllocManaged(&uvm_gpu_base, cfg_uvm_mem_size, - CU_MEM_ATTACH_GLOBAL); + CUresult ret = CUDA_DRIVER_FNPTR(cuMemAllocManaged)(&uvm_gpu_base, + cfg_uvm_mem_size, + CU_MEM_ATTACH_GLOBAL); if(ret != CUDA_SUCCESS) { if(ret == CUDA_ERROR_OUT_OF_MEMORY) { log_gpu.fatal() << "unable to allocate managed memory: " @@ -3511,7 +3732,7 @@ namespace Realm { } else { const char *errstring = "error message not available"; #if CUDA_VERSION >= 6050 - cuGetErrorName(ret, &errstring); + CUDA_DRIVER_FNPTR(cuGetErrorName)(ret, &errstring); #endif log_gpu.fatal() << "unexpected error from cuMemAllocManaged: result=" << ret << " (" << errstring << ")"; @@ -3533,9 +3754,10 @@ namespace Realm { int concurrent_access; { AutoGPUContext agc(gpus[i]); - CHECK_CU( cuDeviceGetAttribute(&concurrent_access, - CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, - gpus[i]->info->device) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuDeviceGetAttribute) + (&concurrent_access, + CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, + gpus[i]->info->device) ); } 
if(concurrent_access) { @@ -3604,9 +3826,10 @@ namespace Realm { CUresult ret; { AutoGPUContext agc(gpus[0]); - ret = cuMemHostRegister(base, (*it)->size, - CU_MEMHOSTREGISTER_PORTABLE | - CU_MEMHOSTREGISTER_DEVICEMAP); + ret = CUDA_DRIVER_FNPTR(cuMemHostRegister)(base, + (*it)->size, + CU_MEMHOSTREGISTER_PORTABLE | + CU_MEMHOSTREGISTER_DEVICEMAP); } if(ret != CUDA_SUCCESS) { log_gpu.info() << "failed to register mem " << (*it)->me << " (" << base << " + " << (*it)->size << ") : " @@ -3622,7 +3845,9 @@ namespace Realm { CUresult ret; { AutoGPUContext agc(gpus[i]); - ret = cuMemHostGetDevicePointer(&gpuptr, base, 0); + ret = CUDA_DRIVER_FNPTR(cuMemHostGetDevicePointer)(&gpuptr, + base, + 0); } if(ret == CUDA_SUCCESS) { // no test for && ((void *)gpuptr == base)) { @@ -3684,19 +3909,19 @@ namespace Realm { if(zcmem_cpu_base) { assert(!gpus.empty()); AutoGPUContext agc(gpus[0]); - CHECK_CU( cuMemFreeHost(zcmem_cpu_base) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemFreeHost)(zcmem_cpu_base) ); } if(zcib_cpu_base) { assert(!gpus.empty()); AutoGPUContext agc(gpus[0]); - CHECK_CU( cuMemFreeHost(zcib_cpu_base) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemFreeHost)(zcib_cpu_base) ); } if(uvm_base) { assert(!gpus.empty()); AutoGPUContext agc(gpus[0]); - CHECK_CU( cuMemFree(reinterpret_cast(uvm_base)) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemFree)(reinterpret_cast(uvm_base)) ); } // also unregister any host memory at this time @@ -3705,7 +3930,7 @@ namespace Realm { for(std::vector::const_iterator it = registered_host_ptrs.begin(); it != registered_host_ptrs.end(); ++it) - CHECK_CU( cuMemHostUnregister(*it) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemHostUnregister)(*it) ); registered_host_ptrs.clear(); } From 4da9b5774f8f99beeff178126d482cf41ccd07f5 Mon Sep 17 00:00:00 2001 From: Sean Treichler Date: Fri, 10 Dec 2021 16:18:59 -0800 Subject: [PATCH 16/36] ci: add test for dynamic loading of cuda libs --- .gitlab-ci.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/.gitlab-ci.yml b/.gitlab-ci.yml index 363c73c3fe..6cb253f480 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -160,6 +160,8 @@ variables: USE_LLVM: "1" .cuda: &cuda USE_CUDA: "1" +.cuda_dynamic: &cuda_dynamic + EXTRA_CMAKE_ARGS: "-DLegion_CUDA_DYNAMIC_LOAD=ON -DLegion_HIJACK_CUDART=OFF" .gasnet1_mpi: &gasnet1_mpi REALM_NETWORKS: "gasnet1" CONDUIT: "mpi" @@ -1248,10 +1250,10 @@ p100_cuda102_gcc8_cxx11_release_cmake_cuda_legion: <<: [*p100, *tests] variables: <<: [*gcc8, *release, *cxx11_normal, *p100_cuda102, *cmake, *legion, *ctest] -p100_cuda110_gcc9_cxx14_debug_cmake_cuda_legion: +p100_cuda110_gcc9_cxx14_debug_cmake_cuda_dynamic_legion: <<: [*p100, *tests] variables: - <<: [*gcc9, *debug, *cxx14_normal, *p100_cuda110, *cmake, *legion, *ctest, *short] + <<: [*gcc9, *debug, *cxx14_normal, *p100_cuda110, *cmake, *legion, *cuda_dynamic, *ctest, *short] p100_cuda110_gcc9_cxx14_release_cuda_legion: <<: [*p100, *tests] variables: From c596071bd4fc30b9b290f8cdd8d8d791ad780a77 Mon Sep 17 00:00:00 2001 From: Sean Treichler Date: Mon, 13 Dec 2021 12:40:40 -0800 Subject: [PATCH 17/36] test: fix shutdown race in task_throughput test --- .../realm/task_throughput/task_throughput.cc | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/test/performance/realm/task_throughput/task_throughput.cc b/test/performance/realm/task_throughput/task_throughput.cc index ef1f0563df..d41fb7445c 100644 --- a/test/performance/realm/task_throughput/task_throughput.cc +++ b/test/performance/realm/task_throughput/task_throughput.cc @@ -202,6 +202,9 @@ void task_launcher(const void *args, size_t arglen, if(TestConfig::tasks_per_processor < 2) TestConfig::tasks_per_processor = 2; + // keep completion events for all tasks for race-free cleanup + std::vector events; + for(int i = 0; i < TestConfig::tasks_per_processor; i++) { int which = ((i == 0) ? FIRST_TASK : (i == (TestConfig::tasks_per_processor - 1)) ? 
LAST_TASK : @@ -224,6 +227,7 @@ void task_launcher(const void *args, size_t arglen, Event e = (*it).spawn(task_id, tta, argsize, prs, preconds[*it], which); if(TestConfig::chain_tasks) preconds[*it] = e; + events.push_back(e); total_tasks++; } } @@ -248,6 +252,9 @@ void task_launcher(const void *args, size_t arglen, if(TestConfig::skip_launch_procs) la.finish_barrier.arrive(); + + // don't actually terminate until all the tasks we spawned are done + Event::merge_events(events).wait(); } void top_level_task(const void *args, size_t arglen, @@ -332,6 +339,8 @@ void top_level_task(const void *args, size_t arglen, } // spawn launcher tasks in each address space + std::vector launchers_done; + for(std::map >::const_iterator it = all_procs.begin(); it != all_procs.end(); ++it) { @@ -340,14 +349,17 @@ void top_level_task(const void *args, size_t arglen, for(int i = 0; i < TestConfig::launching_processors; i++) { Processor p = lp[i]; - // no need to grab the finish event - we wait indirectly via the barrier - p.spawn(TASK_LAUNCHER, args_data, args_size); + Event e = p.spawn(TASK_LAUNCHER, args_data, args_size); + launchers_done.push_back(e); } } free(args_data); // all done - wait for everything to finish via the finish_barrier launch_args.finish_barrier.wait(); + + // for orderly shutdown, make sure the launcher tasks themselves are done + Event::merge_events(launchers_done).wait(); } int main(int argc, char **argv) From ffa66700abea11622e97ee38c4db49938403d5d5 Mon Sep 17 00:00:00 2001 From: Patrick McCormick <651611+pmccormick@users.noreply.github.com> Date: Wed, 3 Nov 2021 12:04:08 -0600 Subject: [PATCH 18/36] CMake update to expose parameters for setting the maximum number of supported nodes and processors-per-node (Legion_MAX_NUM_NODES, Legion_MAX_NUM_PROCS). 
--- CMakeLists.txt | 50 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e5134d8cb6..853e7497f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -175,7 +175,7 @@ set(CMAKE_ENABLE_EXPORTS ON) #------------------------------------------------------------------------------# # For now we want the optimization flags to match on both normal make and cmake -# builds so we override the cmake defaults here for release, this changes +# builds so we override the cmake defaults here for release, this changes # -O3 to -O2 and removes -DNDEBUG set(CMAKE_CXX_FLAGS_RELEASE "-O2") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g") @@ -263,6 +263,44 @@ endif() set(BINDINGS_DEFAULT_MODULE "" CACHE STRING "module to load by default in Python bindings, if any") +#------------------------------------------------------------------------------# +# System configuration and limits +#------------------------------------------------------------------------------# +function(is_power_of_two x ret) + set(${ret} FALSE PARENT_SCOPE) + if (${x} LESS_EQUAL 0) + return() + endif() + math(EXPR y "${x} & (${x} - 1)") + if (${y} EQUAL 0) + set(${ret} TRUE PARENT_SCOPE) + endif() +endfunction() + +if (NOT DEFINED Legion_MAX_NUM_NODES) + set(Legion_MAX_NUM_NODES 1024 CACHE STRING + "Maximum number of nodes supported by the runtime -- MUST be a power of two.") + mark_as_advanced(Legion_MAX_NUM_NODES) +else() + is_power_of_two(${Legion_MAX_NUM_NODES} is_pow_two) + if (NOT is_pow_two) + message(FATAL_ERROR "Legion_MAX_NUM_NODES must be a power of two.") + endif() +endif() +add_compile_definitions(LEGION_MAX_NUM_NODES=${Legion_MAX_NUM_NODES}) + +if (NOT DEFINED Legion_MAX_NUM_PROCS) + set(Legion_MAX_NUM_PROCS 64 CACHE STRING + "Maximum number of processors (per node) supported by the runtime -- MUST be a power of two.") + mark_as_advanced(Legion_MAX_NUM_PROCS) +else() + is_power_of_two(${Legion_MAX_NUM_PROCS} 
is_pow_two) + if (NOT is_pow_two) + message(FATAL_ERROR "Legion_MAX_NUM_PROCS must be a power of two.") + endif() +endif() +add_compile_definitions(LEGION_MAX_NUM_PROCS=${Legion_MAX_NUM_PROCS}) + #------------------------------------------------------------------------------# # Kokkos configuration #------------------------------------------------------------------------------# @@ -374,9 +412,9 @@ if(Legion_USE_HIP) install(FILES ${Legion_SOURCE_DIR}/cmake/newcmake/FindCUDA.cmake DESTINATION ${CMAKE_INSTALL_DATADIR}/Legion/cmake/newcmake ) - set(HIPCC_FLAGS "-D__HIP_PLATFORM_NVCC__") + set(HIPCC_FLAGS "-D__HIP_PLATFORM_NVCC__") endif() - + if (Legion_HIP_TARGET STREQUAL "ROCM") set(Legion_HIP_ARCH "" CACHE STRING "Comma-separated list of HIP architectures to build for (e.g. gfx906,gfx908)") @@ -390,7 +428,7 @@ if(Legion_USE_HIP) set(HIP_GENCODE "--amdgpu-target=${Legion_HIP_ARCH}") endif() endif() - + # find the hip library find_package(HIP REQUIRED) @@ -579,7 +617,7 @@ if(Legion_USE_ZLIB) # define variable for legion_defines.h set(LEGION_USE_ZLIB ON) endif() - + #------------------------------------------------------------------------------# # Fortran configuration #------------------------------------------------------------------------------# @@ -745,7 +783,7 @@ if(Legion_BUILD_ALL OR Legion_BUILD_APPS OR Legion_BUILD_BINDINGS OR Legion_BUIL set(PROP $) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} $<$:-I$>") endif() - + if(Legion_HIP_TARGET STREQUAL "CUDA") set(PROP $) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${PROP}") From 09f30796d21de21e068e023888bcf3627c2509a0 Mon Sep 17 00:00:00 2001 From: Sean Treichler Date: Mon, 8 Nov 2021 16:02:17 -0800 Subject: [PATCH 19/36] build: use cmakedefines instead of compile_definitions --- CMakeLists.txt | 81 +++++++++++++++++++-------------------- cmake/legion_defines.h.in | 4 ++ 2 files changed, 43 insertions(+), 42 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 853e7497f2..eaada1e7dc 100644 --- a/CMakeLists.txt 
+++ b/CMakeLists.txt @@ -263,44 +263,6 @@ endif() set(BINDINGS_DEFAULT_MODULE "" CACHE STRING "module to load by default in Python bindings, if any") -#------------------------------------------------------------------------------# -# System configuration and limits -#------------------------------------------------------------------------------# -function(is_power_of_two x ret) - set(${ret} FALSE PARENT_SCOPE) - if (${x} LESS_EQUAL 0) - return() - endif() - math(EXPR y "${x} & (${x} - 1)") - if (${y} EQUAL 0) - set(${ret} TRUE PARENT_SCOPE) - endif() -endfunction() - -if (NOT DEFINED Legion_MAX_NUM_NODES) - set(Legion_MAX_NUM_NODES 1024 CACHE STRING - "Maximum number of nodes supported by the runtime -- MUST be a power of two.") - mark_as_advanced(Legion_MAX_NUM_NODES) -else() - is_power_of_two(${Legion_MAX_NUM_NODES} is_pow_two) - if (NOT is_pow_two) - message(FATAL_ERROR "Legion_MAX_NUM_NODES must be a power of two.") - endif() -endif() -add_compile_definitions(LEGION_MAX_NUM_NODES=${Legion_MAX_NUM_NODES}) - -if (NOT DEFINED Legion_MAX_NUM_PROCS) - set(Legion_MAX_NUM_PROCS 64 CACHE STRING - "Maximum number of processors (per node) supported by the runtime -- MUST be a power of two.") - mark_as_advanced(Legion_MAX_NUM_PROCS) -else() - is_power_of_two(${Legion_MAX_NUM_PROCS} is_pow_two) - if (NOT is_pow_two) - message(FATAL_ERROR "Legion_MAX_NUM_PROCS must be a power of two.") - endif() -endif() -add_compile_definitions(LEGION_MAX_NUM_PROCS=${Legion_MAX_NUM_PROCS}) - #------------------------------------------------------------------------------# # Kokkos configuration #------------------------------------------------------------------------------# @@ -638,12 +600,47 @@ mark_as_advanced(Legion_MAX_DIM) set(LEGION_MAX_DIM ${Legion_MAX_DIM}) set(REALM_MAX_DIM ${Legion_MAX_DIM}) -set(Legion_MAX_FIELDS 512 CACHE STRING "Maximum number of fields allocated to a single field space") -set_property(CACHE Legion_MAX_FIELDS PROPERTY STRINGS 32 64 128 256 512 1024) 
-mark_as_advanced(Legion_MAX_FIELDS) +# legion uses statically-sized arrays for performance in some cases - make +# sure they're big enough for your use case +set(Legion_MAX_FIELDS 512 CACHE STRING + "Maximum number of fields allocated to a single field space -- MUST be a power of two.") +set(Legion_MAX_NUM_NODES 1024 CACHE STRING + "Maximum number of nodes supported by the runtime -- MUST be a power of two.") +set(Legion_MAX_NUM_PROCS 64 CACHE STRING + "Maximum number of processors (per node) supported by the runtime -- MUST be a power of two.") + +# check that they're all powers of two -# define variable for legion_defines.h +function(is_power_of_two x ret) + set(${ret} FALSE PARENT_SCOPE) + if (${x} LESS_EQUAL 0) + return() + endif() + math(EXPR y "${x} & (${x} - 1)") + if (${y} EQUAL 0) + set(${ret} TRUE PARENT_SCOPE) + endif() +endfunction() + +is_power_of_two(${Legion_MAX_FIELDS} is_pow_two) +if (NOT is_pow_two) + message(FATAL_ERROR "Legion_MAX_FIELDS must be a power of two.") +endif() + +is_power_of_two(${Legion_MAX_NUM_NODES} is_pow_two) +if (NOT is_pow_two) + message(FATAL_ERROR "Legion_MAX_NUM_NODES must be a power of two.") +endif() + +is_power_of_two(${Legion_MAX_NUM_PROCS} is_pow_two) +if (NOT is_pow_two) + message(FATAL_ERROR "Legion_MAX_NUM_PROCS must be a power of two.") +endif() + +# define variables for legion_defines.h set(LEGION_MAX_FIELDS ${Legion_MAX_FIELDS}) +set(LEGION_MAX_NUM_NODES ${Legion_MAX_NUM_NODES}) +set(LEGION_MAX_NUM_PROCS ${Legion_MAX_NUM_PROCS}) option(Legion_WARNINGS_FATAL "Make all runtime warnings fatal" OFF) set(LEGION_WARNINGS_FATAL ${Legion_WARNINGS_FATAL}) diff --git a/cmake/legion_defines.h.in b/cmake/legion_defines.h.in index 66df1bda7c..2517ec32fa 100644 --- a/cmake/legion_defines.h.in +++ b/cmake/legion_defines.h.in @@ -23,6 +23,10 @@ #cmakedefine LEGION_MAX_FIELDS @LEGION_MAX_FIELDS@ +#cmakedefine LEGION_MAX_NUM_NODES @LEGION_MAX_NUM_NODES@ + +#cmakedefine LEGION_MAX_NUM_PROCS @LEGION_MAX_NUM_PROCS@ + #cmakedefine 
LEGION_USE_CUDA #cmakedefine LEGION_GPU_REDUCTIONS From 6bc8030e1b33b5be4c326ac0eb9223b2260b1a1c Mon Sep 17 00:00:00 2001 From: Mike Bauer Date: Mon, 13 Dec 2021 17:58:51 -0800 Subject: [PATCH 20/36] legion: more work on reference counting for distributed index space expressions --- runtime/legion/garbage_collection.cc | 89 +++++++++ runtime/legion/garbage_collection.h | 43 ++++- runtime/legion/legion_analysis.cc | 87 +++++---- runtime/legion/legion_analysis.h | 20 +- runtime/legion/legion_context.cc | 2 +- runtime/legion/legion_context.h | 14 +- runtime/legion/legion_instances.cc | 93 +++++----- runtime/legion/legion_instances.h | 30 ++- runtime/legion/legion_ops.h | 3 +- runtime/legion/legion_tasks.cc | 2 +- runtime/legion/legion_types.h | 36 ++-- runtime/legion/legion_views.cc | 11 +- runtime/legion/region_tree.cc | 262 +++++++++++++++++++-------- runtime/legion/region_tree.h | 41 ++++- runtime/legion/region_tree.inl | 28 ++- runtime/legion/runtime.cc | 104 ++++------- 16 files changed, 534 insertions(+), 331 deletions(-) diff --git a/runtime/legion/garbage_collection.cc b/runtime/legion/garbage_collection.cc index 840c685068..d3b185827b 100644 --- a/runtime/legion/garbage_collection.cc +++ b/runtime/legion/garbage_collection.cc @@ -31,6 +31,7 @@ namespace Legion { //-------------------------------------------------------------------------- LocalReferenceMutator::LocalReferenceMutator( const LocalReferenceMutator &rhs) + : waiter(rhs.waiter) //-------------------------------------------------------------------------- { // should never be called @@ -43,6 +44,9 @@ namespace Legion { { if (!mutation_effects.empty()) { +#ifdef DEBUG_LEGION + assert(waiter); +#endif RtEvent wait_on = Runtime::merge_events(mutation_effects); wait_on.wait(); } @@ -69,6 +73,9 @@ namespace Legion { RtEvent LocalReferenceMutator::get_done_event(void) //-------------------------------------------------------------------------- { +#ifdef DEBUG_LEGION + assert(!waiter); +#endif if 
(mutation_effects.empty()) return RtEvent::NO_RT_EVENT; RtEvent result = Runtime::merge_events(mutation_effects); @@ -108,6 +115,20 @@ namespace Legion { mutation_effects.insert(ev); } + ///////////////////////////////////////////////////////////// + // ImplicitReferenceTracker + ///////////////////////////////////////////////////////////// + + //-------------------------------------------------------------------------- + ImplicitReferenceTracker::~ImplicitReferenceTracker(void) + //-------------------------------------------------------------------------- + { + for (std::vector::const_iterator it = + live_expressions.begin(); it != live_expressions.end(); it++) + if ((*it)->remove_base_expression_reference(LIVE_EXPR_REF)) + delete (*it); + } + ///////////////////////////////////////////////////////////// // DistributedCollectable ///////////////////////////////////////////////////////////// @@ -1800,6 +1821,23 @@ namespace Legion { #ifdef DEBUG_LEGION assert(count != 0); assert(registered_with_runtime); +#endif +#if 0 + // If there is no mutator or it is a non-waiting mutator then we + // can buffer this up in the implicit reference tracker and send it + // at the end of the runtime call or meta-task + if ((mutator == NULL) || !mutator->is_waiting_mutator()) + { + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + const RtEvent send_event = + implicit_reference_tracker->record_valid_increment(did, target, + precondition, count); + if (mutator != NULL) + mutator->record_reference_mutation_effect( + implicit_reference_tracker->get_effects_event()); + return send_event; + } #endif RtUserEvent done_event; if (mutator != NULL) @@ -1836,6 +1874,23 @@ namespace Legion { #ifdef DEBUG_LEGION assert(count != 0); assert(registered_with_runtime); +#endif +#if 0 + // If there is no mutator or it is a non-waiting mutator then we + // can buffer this up in the implicit reference tracker and send it + // at the end of the 
runtime call or meta-task + if ((mutator == NULL) || !mutator->is_waiting_mutator()) + { + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + const RtEvent send_event = + implicit_reference_tracker->record_valid_decrement(did, target, + precondition, count); + if (mutator != NULL) + mutator->record_reference_mutation_effect( + implicit_reference_tracker->get_effects_event()); + return send_event; + } #endif RtUserEvent done_event; if (mutator != NULL) @@ -1872,6 +1927,23 @@ namespace Legion { #ifdef DEBUG_LEGION assert(count != 0); assert(registered_with_runtime); +#endif +#if 0 + // If there is no mutator or it is a non-waiting mutator then we + // can buffer this up in the implicit reference tracker and send it + // at the end of the runtime call or meta-task + if ((mutator == NULL) || !mutator->is_waiting_mutator()) + { + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + const RtEvent send_event = + implicit_reference_tracker->record_gc_increment(did, target, + precondition, count); + if (mutator != NULL) + mutator->record_reference_mutation_effect( + implicit_reference_tracker->get_effects_event()); + return send_event; + } #endif RtUserEvent done_event; if (mutator != NULL) @@ -1908,6 +1980,23 @@ namespace Legion { #ifdef DEBUG_LEGION assert(count != 0); assert(registered_with_runtime); +#endif +#if 0 + // If there is no mutator or it is a non-waiting mutator then we + // can buffer this up in the implicit reference tracker and send it + // at the end of the runtime call or meta-task + if ((mutator == NULL) || !mutator->is_waiting_mutator()) + { + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + const RtEvent send_event = + implicit_reference_tracker->record_gc_increment(did, target, + precondition, count); + if (mutator != NULL) + mutator->record_reference_mutation_effect( + 
implicit_reference_tracker->get_effects_event()); + return send_event; + } #endif RtUserEvent done_event; if (mutator != NULL) diff --git a/runtime/legion/garbage_collection.h b/runtime/legion/garbage_collection.h index 7166e010ec..ae2c09445d 100644 --- a/runtime/legion/garbage_collection.h +++ b/runtime/legion/garbage_collection.h @@ -182,6 +182,7 @@ namespace Legion { */ class ReferenceMutator { public: + virtual bool is_waiting_mutator(void) const = 0; virtual void record_reference_mutation_effect(RtEvent event) = 0; }; @@ -193,17 +194,19 @@ namespace Legion { */ class LocalReferenceMutator : public ReferenceMutator { public: - LocalReferenceMutator(void) { } + LocalReferenceMutator(bool wait) : waiter(wait) { } LocalReferenceMutator(const LocalReferenceMutator &rhs); ~LocalReferenceMutator(void); public: LocalReferenceMutator& operator=(const LocalReferenceMutator &rhs); public: + virtual bool is_waiting_mutator(void) const { return waiter; } virtual void record_reference_mutation_effect(RtEvent event); public: RtEvent get_done_event(void); private: std::set mutation_effects; + const bool waiter; }; /** @@ -219,18 +222,46 @@ namespace Legion { public: WrapperReferenceMutator& operator=(const WrapperReferenceMutator &rhs); public: + virtual bool is_waiting_mutator(void) const { return false; } virtual void record_reference_mutation_effect(RtEvent event); private: std::set &mutation_effects; }; /** - * \class IgnoreReferenceMutator - * This will ignore any reference effects + * \class ImplicitReferenceTracker + * This class tracks implicit references that are held either by + * an application runtime API call or a meta-task. At the end of the + * runtime API call or meta-task the references are updated. 
*/ - class IgnoreReferenceMutator : public ReferenceMutator { - public: - virtual void record_reference_mutation_effect(RtEvent event) { } + class ImplicitReferenceTracker { + public: + ImplicitReferenceTracker(void) { } + ImplicitReferenceTracker(const ImplicitReferenceTracker&) = delete; + ~ImplicitReferenceTracker(void); + public: + ImplicitReferenceTracker& operator=( + const ImplicitReferenceTracker&) = delete; + public: + inline void record_live_expression(IndexSpaceExpression *expr) + { live_expressions.emplace_back(expr); } +#if 0 + public: + RtEvent record_valid_increment(DistributedID did, + AddressSpaceID target, + unsigned count); + RtEvent record_valid_decrement(DistributedID did, + AddressSpaceID target, + unsigned count); + RtEvent record_gc_increment(DistributedID did, + AddressSpaceID target, + unsigned count); + RtEvent record_gc_decrement(DistributedID did, + AddressSpaceID target, + unsigned count); +#endif + private: + std::vector live_expressions; }; /** diff --git a/runtime/legion/legion_analysis.cc b/runtime/legion/legion_analysis.cc index 90d8440167..a929dbae3f 100644 --- a/runtime/legion/legion_analysis.cc +++ b/runtime/legion/legion_analysis.cc @@ -130,8 +130,8 @@ namespace Legion { } //-------------------------------------------------------------------------- - void PhysicalUser::pack_user(Serializer &rez, - const AddressSpaceID target) const + void PhysicalUser::pack_user(Serializer &rez, const AddressSpaceID target, + bool need_reference) const //-------------------------------------------------------------------------- { RezCheck z(rez); @@ -139,12 +139,11 @@ namespace Legion { rez.serialize(collect_event); #endif rez.serialize(usage); - expr->pack_expression(rez, target); + expr->pack_expression(rez, target, need_reference); rez.serialize(op_id); rez.serialize(index); rez.serialize(copy_user); rez.serialize(covers); - } //-------------------------------------------------------------------------- @@ -10082,7 +10081,7 @@ namespace 
Legion { if (is_logical_owner() || initial_refinement) { // We're the owner so we can do the merge - LocalReferenceMutator mutator; + LocalReferenceMutator mutator(false/*waiter*/); for (FieldMaskSet::const_iterator it = new_views.begin(); it != new_views.end(); it++) if (valid_instances.insert(it->first, it->second)) @@ -10611,7 +10610,7 @@ namespace Legion { transition_event = RtUserEvent::NO_RT_USER_EVENT; } eq_state = MAPPING_STATE; - LocalReferenceMutator mutator; + LocalReferenceMutator mutator(false/*waiter*/); // Add references to all the views that we've loaded for (FieldMaskSet::const_iterator it = valid_instances.begin(); it != valid_instances.end(); it++) @@ -13559,7 +13558,8 @@ namespace Legion { else rez.serialize(logical_owner_space); } - set_expr->pack_expression(rez, target); + // No need for a reference here since we know we'll continue holding it + set_expr->pack_expression(rez, target, false/*need reference*/); if (index_space_node != NULL) rez.serialize(index_space_node->handle); else @@ -13886,7 +13886,7 @@ namespace Legion { const RemoteRefTaskArgs *rargs = (const RemoteRefTaskArgs*)args; if (rargs->done_event.exists()) { - LocalReferenceMutator mutator; + LocalReferenceMutator mutator(false/*waiter*/); if (rargs->add_references) { for (std::map::const_iterator it = @@ -13925,16 +13925,14 @@ namespace Legion { RayTracer *t, IndexSpaceExpression *e, IndexSpace h, AddressSpaceID o, RtUserEvent d, RtUserEvent def, const FieldMask &m, - bool local, bool is_expr_s, IndexSpace expr_h, - IndexSpaceExprID expr_i) + const PendingRemoteExpression *p) : LgTaskArgs(implicit_provenance), - set(s), target(t), expr(local ? e : NULL), handle(h), origin(o), + set(s), target(t), expr(e), handle(h), origin(o), done(d), deferral(def), ray_mask(new FieldMask(m)), - expr_handle(expr_h), expr_id(expr_i), is_local(local), - is_expr_space(is_expr_s) + pending((p == NULL) ? 
NULL : new PendingRemoteExpression(*p)) //-------------------------------------------------------------------------- { - if (local) + if (expr != NULL) expr->add_base_expression_reference(META_TASK_REF); } @@ -13946,18 +13944,20 @@ namespace Legion { const DeferRayTraceArgs *dargs = (const DeferRayTraceArgs*)args; // See if we need to load the expression or not - IndexSpaceExpression *expr = (dargs->is_local) ? dargs->expr : - (dargs->is_expr_space) ? runtime->forest->get_node(dargs->expr_handle) - : runtime->forest->find_remote_expression(dargs->expr_id); + IndexSpaceExpression *expr = dargs->expr; + if (expr == NULL) + expr = runtime->forest->find_remote_expression(*(dargs->pending)); dargs->set->ray_trace_equivalence_sets(dargs->target, expr, *(dargs->ray_mask), dargs->handle, dargs->origin, dargs->done, dargs->deferral); // Clean up our ray mask delete dargs->ray_mask; // Remove our expression reference too - if (dargs->is_local && + if ((dargs->expr != NULL) && dargs->expr->remove_base_expression_reference(META_TASK_REF)) delete dargs->expr; + if (dargs->pending != NULL) + delete dargs->pending; } //-------------------------------------------------------------------------- @@ -14049,14 +14049,15 @@ namespace Legion { //-------------------------------------------------------------------------- EquivalenceSet::DeferResponseArgs::DeferResponseArgs(DistributedID id, AddressSpaceID src, AddressSpaceID log, - IndexSpaceExpression *ex, bool local, bool is_space, - IndexSpace expr_h, IndexSpaceExprID xid, IndexSpace h) + IndexSpaceExpression *ex, + const PendingRemoteExpression &p, IndexSpace h) : LgTaskArgs(implicit_provenance), - did(id), source(src), logical_owner(log), expr(ex), is_local(local), - is_index_space(is_space), expr_handle(expr_h), expr_id(xid), handle(h) + did(id), source(src), logical_owner(log), expr(ex), + pending((expr != NULL) ? 
NULL : new PendingRemoteExpression(p)), + handle(h) //-------------------------------------------------------------------------- { - if (is_local) + if (expr != NULL) expr->add_base_expression_reference(META_TASK_REF); } @@ -14070,11 +14071,11 @@ namespace Legion { derez.deserialize(did); AddressSpaceID logical_owner; derez.deserialize(logical_owner); - bool is_local, is_index_space; - IndexSpace expr_handle; IndexSpaceExprID expr_id; RtEvent wait_for; + PendingRemoteExpression pending; + RtEvent wait_for; IndexSpaceExpression *expr = - IndexSpaceExpression::unpack_expression(derez, runtime->forest, source, - is_local, is_index_space, expr_handle, expr_id, wait_for); + IndexSpaceExpression::unpack_expression(derez, runtime->forest, + source, pending, wait_for); IndexSpace handle; derez.deserialize(handle); IndexSpaceNode *node = NULL; RtEvent wait_on; @@ -14096,16 +14097,15 @@ namespace Legion { precondition = wait_on; if (precondition.exists() && !precondition.has_triggered()) { - DeferResponseArgs args(did, source, logical_owner, expr, is_local, - is_index_space, expr_handle, expr_id, handle); + DeferResponseArgs args(did, source, logical_owner, expr, + pending, handle); runtime->issue_runtime_meta_task(args, LG_LATENCY_MESSAGE_PRIORITY, precondition); return; } // If we fall through we need to refetch things that we didn't get if (expr == NULL) - expr = is_index_space ? runtime->forest->get_node(expr_handle) : - runtime->forest->find_remote_expression(expr_id); + expr = runtime->forest->find_remote_expression(pending); if (handle.exists() && (node == NULL)) node = runtime->forest->get_node(handle); } @@ -14128,9 +14128,9 @@ namespace Legion { //-------------------------------------------------------------------------- { const DeferResponseArgs *dargs = (const DeferResponseArgs*)args; - IndexSpaceExpression *expr = (dargs->is_local) ? dargs->expr : - (dargs->is_index_space) ? 
runtime->forest->get_node(dargs->expr_handle) - : runtime->forest->find_remote_expression(dargs->expr_id); + IndexSpaceExpression *expr = dargs->expr; + if (expr == NULL) + expr = runtime->forest->find_remote_expression(*(dargs->pending)); IndexSpaceNode *node = NULL; if (dargs->handle.exists() && (dargs->logical_owner == runtime->address_space)) @@ -14147,9 +14147,11 @@ namespace Legion { // Once construction is complete then we do the registration set->register_with_runtime(NULL/*no remote registration needed*/); // Remove our expression reference too - if (dargs->is_local && + if ((dargs->expr != NULL) && dargs->expr->remove_base_expression_reference(META_TASK_REF)) delete dargs->expr; + if (dargs->pending != NULL) + delete dargs->pending; } //-------------------------------------------------------------------------- @@ -14233,13 +14235,11 @@ namespace Legion { RayTracer *target; derez.deserialize(target); - bool is_local, is_expr_space; - IndexSpace expr_handle; - IndexSpaceExprID expr_id; + PendingRemoteExpression pending; RtEvent expr_ready; IndexSpaceExpression *expr = - IndexSpaceExpression::unpack_expression(derez, runtime->forest, source, - is_local, is_expr_space, expr_handle, expr_id, expr_ready); + IndexSpaceExpression::unpack_expression(derez, runtime->forest, + source, pending, expr_ready); FieldMask ray_mask; derez.deserialize(ray_mask); IndexSpace handle; @@ -14257,15 +14257,13 @@ namespace Legion { DeferRayTraceArgs args(set, target, expr, handle, origin, done_event, RtUserEvent::NO_RT_USER_EVENT, - ray_mask, is_local, is_expr_space, - expr_handle, expr_id); + ray_mask, &pending); runtime->issue_runtime_meta_task(args, LG_THROUGHPUT_DEFERRED_PRIORITY, defer); return; } if (expr_ready.exists()) - expr = (is_expr_space) ? 
runtime->forest->get_node(expr_handle) : - runtime->forest->find_remote_expression(expr_id); + expr = runtime->forest->find_remote_expression(pending); // Fall through and actually do the operation now } set->ray_trace_equivalence_sets(target, expr, ray_mask, handle, @@ -14314,7 +14312,6 @@ namespace Legion { RtUserEvent done; derez.deserialize(done); - LocalReferenceMutator mutator; if (ready.exists() && !ready.has_triggered()) ready.wait(); set->unpack_migration(derez, source, done); diff --git a/runtime/legion/legion_analysis.h b/runtime/legion/legion_analysis.h index 58c2205b1e..116d605d78 100644 --- a/runtime/legion/legion_analysis.h +++ b/runtime/legion/legion_analysis.h @@ -705,7 +705,8 @@ namespace Legion { public: PhysicalUser& operator=(const PhysicalUser &rhs); public: - void pack_user(Serializer &rez, const AddressSpaceID target) const; + void pack_user(Serializer &rez, const AddressSpaceID target, + bool need_reference = true) const; static PhysicalUser* unpack_user(Deserializer &derez, RegionTreeForest *forest, const AddressSpaceID source); public: @@ -2120,9 +2121,7 @@ namespace Legion { // These are just for the case where the // request comes from a remote node and // we're waiting for the expression to load - bool is_local=true, bool is_expr_s = false, - IndexSpace expr_h = IndexSpace::NO_SPACE, - IndexSpaceExprID expr_i = 0); + const PendingRemoteExpression *pending = NULL); public: EquivalenceSet *const set; RayTracer *const target; @@ -2132,10 +2131,7 @@ namespace Legion { const RtUserEvent done; const RtUserEvent deferral; FieldMask *const ray_mask; - const IndexSpace expr_handle; - const IndexSpaceExprID expr_id; - const bool is_local; - const bool is_expr_space; + const PendingRemoteExpression *const pending; }; struct DeferRayTraceFinishArgs : public LgTaskArgs { @@ -2213,17 +2209,13 @@ namespace Legion { public: DeferResponseArgs(DistributedID id, AddressSpaceID src, AddressSpaceID log, IndexSpaceExpression *ex, - bool local, bool 
is_space, IndexSpace expr_h, - IndexSpaceExprID xid, IndexSpace h); + const PendingRemoteExpression &pending, IndexSpace h); public: const DistributedID did; const AddressSpaceID source; const AddressSpaceID logical_owner; IndexSpaceExpression *const expr; - const bool is_local; - const bool is_index_space; - const IndexSpace expr_handle; - const IndexSpaceExprID expr_id; + const PendingRemoteExpression *const pending; const IndexSpace handle; }; struct DeferRemoveRefArgs : public LgTaskArgs { diff --git a/runtime/legion/legion_context.cc b/runtime/legion/legion_context.cc index 2da9ccc413..7e4fc5daf2 100644 --- a/runtime/legion/legion_context.cc +++ b/runtime/legion/legion_context.cc @@ -5785,7 +5785,7 @@ namespace Legion { const DistributedID did = runtime->get_available_distributed_id(); FutureMapImpl *impl = new FutureMapImpl(this, runtime, did, runtime->address_space, RtEvent::NO_RT_EVENT); - LocalReferenceMutator mutator; + LocalReferenceMutator mutator(true/*waiter*/); for (std::map::const_iterator it = data.begin(); it != data.end(); it++) { diff --git a/runtime/legion/legion_context.h b/runtime/legion/legion_context.h index ee2d2b16cb..8e6452579f 100644 --- a/runtime/legion/legion_context.h +++ b/runtime/legion/legion_context.h @@ -2064,7 +2064,7 @@ namespace Legion { //-------------------------------------------------------------------------- { #ifdef DEBUG_LEGION - assert(implicit_live_expressions == NULL); + assert(implicit_reference_tracker == NULL); #endif if (overhead_tracker == NULL) return; @@ -2078,16 +2078,10 @@ namespace Legion { inline void TaskContext::end_runtime_call(void) //-------------------------------------------------------------------------- { - if (implicit_live_expressions != NULL) + if (implicit_reference_tracker != NULL) { - // Remove references to any live index space expressions we have - for (std::vector::const_iterator it = - implicit_live_expressions->begin(); it != - implicit_live_expressions->end(); it++) - if 
((*it)->remove_base_expression_reference(LIVE_EXPR_REF)) - delete (*it); - delete implicit_live_expressions; - implicit_live_expressions = NULL; + delete implicit_reference_tracker; + implicit_reference_tracker = NULL; } if (overhead_tracker == NULL) return; diff --git a/runtime/legion/legion_instances.cc b/runtime/legion/legion_instances.cc index f6ccf78777..83be9c6656 100644 --- a/runtime/legion/legion_instances.cc +++ b/runtime/legion/legion_instances.cc @@ -1479,7 +1479,8 @@ namespace Legion { rez.serialize(memory_manager->memory); rez.serialize(instance); rez.serialize(instance_footprint); - instance_domain->pack_expression(rez, target); + // No need for a reference here since we know we'll continue holding it + instance_domain->pack_expression(rez, target, false/*need reference*/); rez.serialize(piece_list_size); if (piece_list_size > 0) rez.serialize(piece_list, piece_list_size); @@ -1510,13 +1511,11 @@ namespace Legion { derez.deserialize(inst); size_t inst_footprint; derez.deserialize(inst_footprint); - bool local_is, domain_is; - IndexSpace domain_handle; - IndexSpaceExprID domain_expr_id; + PendingRemoteExpression pending; RtEvent domain_ready; IndexSpaceExpression *inst_domain = IndexSpaceExpression::unpack_expression(derez, runtime->forest, source, - local_is, domain_is, domain_handle, domain_expr_id, domain_ready); + pending, domain_ready); size_t piece_list_size; derez.deserialize(piece_list_size); void *piece_list = NULL; @@ -1551,18 +1550,16 @@ namespace Legion { { // We need to defer this instance creation DeferIndividualManagerArgs args(did, owner_space, mem, inst, - inst_footprint, local_is, inst_domain, domain_is, domain_handle, - domain_expr_id, handle, tree_id, layout_id, unique_event, redop, - piece_list, piece_list_size, shadow_inst); + inst_footprint, inst_domain, pending, + handle, tree_id, layout_id, unique_event, redop, + piece_list, piece_list_size, source, shadow_inst); runtime->issue_runtime_meta_task(args, 
LG_LATENCY_RESPONSE_PRIORITY, precondition); return; } // If we fall through we need to refetch things that we didn't get if (domain_ready.exists()) - inst_domain = domain_is ? - runtime->forest->get_node(domain_handle) : - runtime->forest->find_remote_expression(domain_expr_id); + inst_domain = runtime->forest->find_remote_expression(pending); if (fs_ready.exists()) space_node = runtime->forest->get_node(handle); if (layout_ready.exists()) @@ -1579,19 +1576,18 @@ namespace Legion { //-------------------------------------------------------------------------- IndividualManager::DeferIndividualManagerArgs::DeferIndividualManagerArgs( DistributedID d, AddressSpaceID own, Memory m, PhysicalInstance i, - size_t f, bool local, IndexSpaceExpression *lx, bool is, - IndexSpace dh, IndexSpaceExprID dx, FieldSpace h, RegionTreeID tid, + size_t f, IndexSpaceExpression *lx, + const PendingRemoteExpression &p, FieldSpace h, RegionTreeID tid, LayoutConstraintID l, ApEvent u, ReductionOpID r, const void *pl, - size_t pl_size, bool shadow) + size_t pl_size, AddressSpaceID src, bool shadow) : LgTaskArgs(implicit_provenance), - did(d), owner(own), mem(m), inst(i), footprint(f), local_is(local), - domain_is(is), local_expr(local ? 
lx : NULL), domain_handle(dh), - domain_expr(dx), handle(h), tree_id(tid), layout_id(l), - use_event(u), redop(r), piece_list(pl), piece_list_size(pl_size), - shadow_instance(shadow) + did(d), owner(own), mem(m), inst(i), footprint(f), pending(p), + local_expr(lx), handle(h), tree_id(tid), + layout_id(l), use_event(u), redop(r), piece_list(pl), + piece_list_size(pl_size), source(src), shadow_instance(shadow) //-------------------------------------------------------------------------- { - if (local_is) + if (local_expr != NULL) local_expr->add_base_expression_reference(META_TASK_REF); } @@ -1602,9 +1598,9 @@ namespace Legion { { const DeferIndividualManagerArgs *dargs = (const DeferIndividualManagerArgs*)args; - IndexSpaceExpression *inst_domain = dargs->local_is ? dargs->local_expr : - dargs->domain_is ? runtime->forest->get_node(dargs->domain_handle) : - runtime->forest->find_remote_expression(dargs->domain_expr); + IndexSpaceExpression *inst_domain = dargs->local_expr; + if (inst_domain == NULL) + inst_domain = runtime->forest->find_remote_expression(dargs->pending); FieldSpaceNode *space_node = runtime->forest->get_node(dargs->handle); LayoutConstraints *constraints = runtime->find_layout_constraints(dargs->layout_id); @@ -1613,7 +1609,7 @@ namespace Legion { dargs->piece_list_size, space_node, dargs->tree_id, constraints, dargs->use_event, dargs->redop, dargs->shadow_instance); // Remove the local expression reference if necessary - if (dargs->local_is && + if ((dargs->local_expr != NULL) && dargs->local_expr->remove_base_expression_reference(META_TASK_REF)) delete dargs->local_expr; } @@ -2905,7 +2901,8 @@ namespace Legion { rez.serialize(owner_space); rez.serialize(point_space->handle); rez.serialize(instance_footprint); - instance_domain->pack_expression(rez, target); + // No need for a reference here since we know we'll continue holding it + instance_domain->pack_expression(rez, target, false/*need reference*/); rez.serialize(field_space_node->handle); 
rez.serialize(tree_id); rez.serialize(redop); @@ -2933,13 +2930,11 @@ namespace Legion { runtime->forest->get_node(points_handle, &points_ready); size_t inst_footprint; derez.deserialize(inst_footprint); - bool local_is, domain_is; - IndexSpace domain_handle; - IndexSpaceExprID domain_expr_id; + PendingRemoteExpression pending; RtEvent domain_ready; IndexSpaceExpression *inst_domain = IndexSpaceExpression::unpack_expression(derez, runtime->forest, source, - local_is, domain_is, domain_handle, domain_expr_id, domain_ready); + pending, domain_ready); size_t piece_list_size; derez.deserialize(piece_list_size); void *piece_list = NULL; @@ -2981,9 +2976,8 @@ namespace Legion { { // We need to defer this instance creation DeferCollectiveManagerArgs args(did, owner_space, points_handle, - inst_footprint, local_is, inst_domain, domain_is, domain_handle, - domain_expr_id, handle, tree_id, layout_id, unique_event, redop, - piece_list, piece_list_size); + inst_footprint, inst_domain, pending, handle, tree_id, layout_id, + unique_event, redop, piece_list, piece_list_size, source); runtime->issue_runtime_meta_task(args, LG_LATENCY_RESPONSE_PRIORITY, precondition); return; @@ -2992,9 +2986,7 @@ namespace Legion { if (points_ready.exists()) point_space = runtime->forest->get_node(points_handle); if (domain_ready.exists()) - inst_domain = domain_is ? 
- runtime->forest->get_node(domain_handle) : - runtime->forest->find_remote_expression(domain_expr_id); + inst_domain = runtime->forest->find_remote_expression(pending); if (fs_ready.exists()) space_node = runtime->forest->get_node(handle); if (layout_ready.exists()) @@ -3010,18 +3002,17 @@ namespace Legion { //-------------------------------------------------------------------------- CollectiveManager::DeferCollectiveManagerArgs::DeferCollectiveManagerArgs( DistributedID d, AddressSpaceID own, IndexSpace points, - size_t f, bool local, IndexSpaceExpression *lx, bool is, - IndexSpace dh, IndexSpaceExprID dx, FieldSpace h, RegionTreeID tid, + size_t f, IndexSpaceExpression *lx, + const PendingRemoteExpression &p, FieldSpace h, RegionTreeID tid, LayoutConstraintID l, ApEvent use, ReductionOpID r, - const void *pl, size_t pl_size) + const void *pl, size_t pl_size, AddressSpace src) : LgTaskArgs(implicit_provenance), - did(d), owner(own), point_space(points), footprint(f), local_is(local), - domain_is(is), local_expr(lx), domain_handle(dh), domain_expr(dx), - handle(h), tree_id(tid), layout_id(l), use_event(use), redop(r), - piece_list(pl), piece_list_size(pl_size) + did(d), owner(own), point_space(points), footprint(f), local_expr(lx), + pending(p), handle(h), tree_id(tid), layout_id(l), use_event(use), + redop(r), piece_list(pl), piece_list_size(pl_size), source(src) //-------------------------------------------------------------------------- { - if (local_is) + if (local_expr != NULL) local_expr->add_base_expression_reference(META_TASK_REF); } @@ -3034,9 +3025,9 @@ namespace Legion { (const DeferCollectiveManagerArgs*)args; IndexSpaceNode *point_space = runtime->forest->get_node(dargs->point_space); - IndexSpaceExpression *inst_domain = dargs->local_is ? dargs->local_expr : - dargs->domain_is ? 
runtime->forest->get_node(dargs->domain_handle) : - runtime->forest->find_remote_expression(dargs->domain_expr); + IndexSpaceExpression *inst_domain = dargs->local_expr; + if (inst_domain == NULL) + inst_domain = runtime->forest->find_remote_expression(dargs->pending); FieldSpaceNode *space_node = runtime->forest->get_node(dargs->handle); LayoutConstraints *constraints = runtime->find_layout_constraints(dargs->layout_id); @@ -3045,7 +3036,7 @@ dargs->piece_list_size, space_node, dargs->tree_id, constraints, dargs->use_event, dargs->redop); // Remove the local expression reference if necessary - if (dargs->local_is && + if ((dargs->local_expr != NULL) && dargs->local_expr->remove_base_expression_reference(META_TASK_REF)) delete dargs->local_expr; } @@ -3102,7 +3093,7 @@ { RtUserEvent to_trigger; derez.deserialize(to_trigger); - LocalReferenceMutator mutator; + LocalReferenceMutator mutator(false/*waiter*/); manager->activate_collective(&mutator); Runtime::trigger_event(to_trigger, mutator.get_done_event()); break; @@ -3111,7 +3102,7 @@ { RtUserEvent to_trigger; derez.deserialize(to_trigger); - LocalReferenceMutator mutator; + LocalReferenceMutator mutator(false/*waiter*/); manager->deactivate_collective(&mutator); Runtime::trigger_event(to_trigger, mutator.get_done_event()); break; @@ -3120,7 +3111,7 @@ { RtUserEvent to_trigger; derez.deserialize(to_trigger); - LocalReferenceMutator mutator; + LocalReferenceMutator mutator(false/*waiter*/); manager->validate_collective(&mutator); Runtime::trigger_event(to_trigger, mutator.get_done_event()); break; @@ -3129,7 +3120,7 @@ { RtUserEvent to_trigger; derez.deserialize(to_trigger); - LocalReferenceMutator mutator; + LocalReferenceMutator mutator(false/*waiter*/); manager->invalidate_collective(&mutator); Runtime::trigger_event(to_trigger, mutator.get_done_event()); break; diff --git a/runtime/legion/legion_instances.h
b/runtime/legion/legion_instances.h index 7815537193..6460916253 100644 --- a/runtime/legion/legion_instances.h +++ b/runtime/legion/legion_instances.h @@ -371,23 +371,19 @@ namespace Legion { static const LgTaskID TASK_ID = LG_DEFER_INDIVIDUAL_MANAGER_TASK_ID; public: DeferIndividualManagerArgs(DistributedID d, AddressSpaceID own, - Memory m, PhysicalInstance i, size_t f, bool local, - IndexSpaceExpression *lx, bool is, IndexSpace dh, - IndexSpaceExprID dx, FieldSpace h, RegionTreeID tid, - LayoutConstraintID l, ApEvent use, ReductionOpID redop, - const void *piece_list, size_t piece_list_size, - bool shadow_instance); + Memory m, PhysicalInstance i, size_t f, IndexSpaceExpression *lx, + const PendingRemoteExpression &pending, FieldSpace h, + RegionTreeID tid, LayoutConstraintID l, ApEvent use, + ReductionOpID redop, const void *piece_list, size_t piece_list_size, + AddressSpaceID src, bool shadow_instance); public: const DistributedID did; const AddressSpaceID owner; const Memory mem; const PhysicalInstance inst; const size_t footprint; - const bool local_is; - const bool domain_is; + const PendingRemoteExpression pending; IndexSpaceExpression *local_expr; - const IndexSpace domain_handle; - const IndexSpaceExprID domain_expr; const FieldSpace handle; const RegionTreeID tree_id; const LayoutConstraintID layout_id; @@ -395,6 +391,7 @@ namespace Legion { const ReductionOpID redop; const void *const piece_list; const size_t piece_list_size; + const AddressSpaceID source; const bool shadow_instance; }; public: @@ -527,20 +524,18 @@ namespace Legion { static const LgTaskID TASK_ID = LG_DEFER_COLLECTIVE_MANAGER_TASK_ID; public: DeferCollectiveManagerArgs(DistributedID d, AddressSpaceID own, - IndexSpace p, size_t f, bool local, IndexSpaceExpression *lx, - bool is, IndexSpace dh, IndexSpaceExprID dx, FieldSpace h, + IndexSpace p, size_t f, IndexSpaceExpression *lx, + const PendingRemoteExpression &pending, FieldSpace h, RegionTreeID tid, LayoutConstraintID l, ApEvent 
use, - ReductionOpID redop, const void *piece_list,size_t piece_list_size); + ReductionOpID redop, const void *piece_list, + size_t piece_list_size, AddressSpaceID source); public: const DistributedID did; const AddressSpaceID owner; IndexSpace point_space; const size_t footprint; - const bool local_is; - const bool domain_is; IndexSpaceExpression *const local_expr; - const IndexSpace domain_handle; - const IndexSpaceExprID domain_expr; + const PendingRemoteExpression pending; const FieldSpace handle; const RegionTreeID tree_id; const LayoutConstraintID layout_id; @@ -548,6 +543,7 @@ namespace Legion { const ReductionOpID redop; const void *const piece_list; const size_t piece_list_size; + const AddressSpaceID source; }; public: CollectiveManager(RegionTreeForest *ctx, DistributedID did, diff --git a/runtime/legion/legion_ops.h b/runtime/legion/legion_ops.h index 4f61311fc0..23fc1e8ea2 100644 --- a/runtime/legion/legion_ops.h +++ b/runtime/legion/legion_ops.h @@ -348,6 +348,7 @@ namespace Legion { const std::vector *dependences = NULL); public: // Inherited from ReferenceMutator + virtual bool is_waiting_mutator(void) const { return false; } virtual void record_reference_mutation_effect(RtEvent event); public: RtEvent execute_prepipeline_stage(GenerationID gen, @@ -617,7 +618,7 @@ namespace Legion { protected: static inline void add_launch_space_reference(IndexSpaceNode *node) { - LocalReferenceMutator mutator; + LocalReferenceMutator mutator(true/*waiter*/); node->add_base_valid_ref(CONTEXT_REF, &mutator); } static inline bool remove_launch_space_reference(IndexSpaceNode *node) diff --git a/runtime/legion/legion_tasks.cc b/runtime/legion/legion_tasks.cc index e7b3d69192..7d7b4f3345 100644 --- a/runtime/legion/legion_tasks.cc +++ b/runtime/legion/legion_tasks.cc @@ -9473,7 +9473,7 @@ namespace Legion { #ifdef DEBUG_LEGION assert(finder != future_handles->handles.end()); #endif - LocalReferenceMutator mutator; + LocalReferenceMutator mutator(false/*not waiting*/); 
FutureImpl *impl = runtime->find_or_create_future(finder->second, parent_ctx->get_context_uid(), &mutator); if (functor != NULL) diff --git a/runtime/legion/legion_types.h b/runtime/legion/legion_types.h index d152d4f767..cdfc372de4 100644 --- a/runtime/legion/legion_types.h +++ b/runtime/legion/legion_types.h @@ -1553,7 +1553,7 @@ namespace Legion { class Notifiable; class ReferenceMutator; class LocalReferenceMutator; - class NeverReferenceMutator; + class ImplicitReferenceTracker; class DistributedCollectable; class LayoutDescription; class InstanceManager; // base class for all instances @@ -1590,6 +1590,7 @@ namespace Legion { class TreeClose; struct CloseInfo; struct FieldDataDescriptor; + struct PendingRemoteExpression; // legion_spy.h class TreeStateLogger; @@ -1629,9 +1630,10 @@ namespace Legion { // This data structure tracks references to any live // temporary index space expressions that have been // handed back by the region tree inside the execution - // of a meta-task or a runtime API call - extern __thread - std::vector *implicit_live_expressions; + // of a meta-task or a runtime API call. It also tracks + // changes to remote distributed collectable that can be + // delayed and batched together. 
+ extern __thread ImplicitReferenceTracker *implicit_reference_tracker; /** * \class LgTaskArgs @@ -2384,11 +2386,10 @@ namespace Legion { UniqueID local_provenance = Internal::implicit_provenance; // Save whether we are in a registration callback unsigned local_callback = Internal::inside_registration_callback; - // Save any local live expressions that we have - std::vector *local_live_expressions = - Internal::implicit_live_expressions; + // Save the reference tracker that we have + ImplicitReferenceTracker *local_tracker = implicit_reference_tracker; #ifdef DEBUG_LEGION - Internal::implicit_live_expressions = NULL; + Internal::implicit_reference_tracker = NULL; #endif // Check to see if we have any local locks to notify if (Internal::local_lock_list != NULL) @@ -2429,10 +2430,10 @@ namespace Legion { // Write the registration callback information back Internal::inside_registration_callback = local_callback; #ifdef DEBUG_LEGION - assert(Internal::implicit_live_expressions == NULL); + assert(Internal::implicit_reference_tracker == NULL); #endif - // Write the local live expressions back - Internal::implicit_live_expressions = local_live_expressions; + // Write the local reference tracker back + Internal::implicit_reference_tracker = local_tracker; } //-------------------------------------------------------------------------- @@ -2445,11 +2446,10 @@ namespace Legion { UniqueID local_provenance = Internal::implicit_provenance; // Save whether we are in a registration callback unsigned local_callback = Internal::inside_registration_callback; - // Save any local live expressions that we have - std::vector *local_live_expressions = - Internal::implicit_live_expressions; + // Save the reference tracker that we have + ImplicitReferenceTracker *local_tracker = implicit_reference_tracker; #ifdef DEBUG_LEGION - Internal::implicit_live_expressions = NULL; + Internal::implicit_reference_tracker = NULL; #endif // Check to see if we have any local locks to notify if 
(Internal::local_lock_list != NULL) @@ -2490,10 +2490,10 @@ namespace Legion { // Write the registration callback information back Internal::inside_registration_callback = local_callback; #ifdef DEBUG_LEGION - assert(Internal::implicit_live_expressions == NULL); + assert(Internal::implicit_reference_tracker == NULL); #endif - // Write the local live expressions back - Internal::implicit_live_expressions = local_live_expressions; + // Write the local reference tracker back + Internal::implicit_reference_tracker = local_tracker; } #ifdef LEGION_SPY diff --git a/runtime/legion/legion_views.cc b/runtime/legion/legion_views.cc index b1750141d7..4a6face762 100644 --- a/runtime/legion/legion_views.cc +++ b/runtime/legion/legion_views.cc @@ -1580,7 +1580,9 @@ namespace Legion { { const unsigned index = indexes.size(); rez.serialize(index); - it->first->pack_user(rez, target); + // No need for a reference since we're replicating so we know + // that we'll continue holding this reference + it->first->pack_user(rez, target, false/*need reference*/); indexes[it->first] = index; } else @@ -1621,7 +1623,9 @@ namespace Legion { { const unsigned index = indexes.size(); rez.serialize(index); - it->first->pack_user(rez, target); + // No need for a reference since we're replicating so we know + // that we'll continue holding the reference to keep it alive + it->first->pack_user(rez, target, false/*need reference*/); indexes[it->first] = index; } else @@ -1646,7 +1650,8 @@ namespace Legion { for (FieldMaskSet::const_iterator it = needed_subviews.begin(); it != needed_subviews.end(); it++) { - it->first->view_expr->pack_expression(rez, target); + // No need for the reference since we're replicating this + it->first->view_expr->pack_expression(rez, target, false/*need ref*/); rez.serialize(it->second); it->first->pack_replication(rez, indexes, it->second, target); } diff --git a/runtime/legion/region_tree.cc b/runtime/legion/region_tree.cc index 4eb323fd36..a8d096ae74 100644 --- 
a/runtime/legion/region_tree.cc +++ b/runtime/legion/region_tree.cc @@ -5631,15 +5631,15 @@ namespace Legion { // Add the live reference if (mutator == NULL) { - LocalReferenceMutator local_mutator; + LocalReferenceMutator local_mutator(true/*waiter*/); result->add_base_expression_reference(LIVE_EXPR_REF, &local_mutator); } else result->add_base_expression_reference(LIVE_EXPR_REF, mutator); // Save it in the implicit live expression references - if (implicit_live_expressions == NULL) - implicit_live_expressions = new std::vector(); - implicit_live_expressions->emplace_back(result); + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(result); // Remove the gc reference that comes back from finding it in the tree if (result->remove_live_reference(REGION_TREE_REF)) assert(false); // should never hit this @@ -5671,7 +5671,7 @@ namespace Legion { } if (expressions.empty()) return *(exprs.begin()); - LocalReferenceMutator local_mutator; + LocalReferenceMutator local_mutator(true/*waiter*/); if (expressions.size() == 1) { IndexSpaceExpression *result = expressions.back(); @@ -5681,10 +5681,9 @@ namespace Legion { result->add_base_expression_reference(LIVE_EXPR_REF,&local_mutator); else result->add_base_expression_reference(LIVE_EXPR_REF, mutator); - if (implicit_live_expressions == NULL) - implicit_live_expressions = - new std::vector; - implicit_live_expressions->emplace_back(result); + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(result); } return result; } @@ -5709,10 +5708,9 @@ namespace Legion { &local_mutator); else result->add_base_expression_reference(LIVE_EXPR_REF, mutator); - if (implicit_live_expressions == NULL) - implicit_live_expressions = - new std::vector; - implicit_live_expressions->emplace_back(result); + if (implicit_reference_tracker == NULL) 
+ implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(result); } return expressions.back(); } @@ -5800,10 +5798,9 @@ namespace Legion { &local_mutator); else result->add_base_expression_reference(LIVE_EXPR_REF, mutator); - if (implicit_live_expressions == NULL) - implicit_live_expressions = - new std::vector(); - implicit_live_expressions->emplace_back(result); + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(result); } // Remove the extra expression reference we added if (result->remove_base_expression_reference(REGION_TREE_REF)) @@ -5824,9 +5821,9 @@ namespace Legion { result->add_base_expression_reference(LIVE_EXPR_REF,&local_mutator); else result->add_base_expression_reference(LIVE_EXPR_REF, mutator); - if (implicit_live_expressions == NULL) - implicit_live_expressions = new std::vector(); - implicit_live_expressions->emplace_back(result); + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(result); } // Remove the reference added by the trie traversal if (result->remove_live_reference(REGION_TREE_REF)) @@ -5953,15 +5950,15 @@ namespace Legion { // Add the live reference if (mutator == NULL) { - LocalReferenceMutator local_mutator; + LocalReferenceMutator local_mutator(true/*waiter*/); result->add_base_expression_reference(LIVE_EXPR_REF, &local_mutator); } else result->add_base_expression_reference(LIVE_EXPR_REF, mutator); // Save it in the implicit live expression references - if (implicit_live_expressions == NULL) - implicit_live_expressions = new std::vector(); - implicit_live_expressions->emplace_back(result); + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(result); // Remove the gc 
reference that comes back with the trie traversal if (result->remove_live_reference(REGION_TREE_REF)) assert(false); // should never hit this @@ -5997,7 +5994,7 @@ namespace Legion { // remove duplicates std::vector::iterator last = std::unique(expressions.begin(), expressions.end()); - LocalReferenceMutator local_mutator; + LocalReferenceMutator local_mutator(true/*waiter*/); if (last != expressions.end()) { expressions.erase(last, expressions.end()); @@ -6014,10 +6011,9 @@ namespace Legion { &local_mutator); else result->add_base_expression_reference(LIVE_EXPR_REF, mutator); - if (implicit_live_expressions == NULL) - implicit_live_expressions = - new std::vector; - implicit_live_expressions->emplace_back(result); + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(result); } return result; } @@ -6092,10 +6088,9 @@ namespace Legion { &local_mutator); else unique->add_base_expression_reference(LIVE_EXPR_REF, mutator); - if (implicit_live_expressions == NULL) - implicit_live_expressions = - new std::vector; - implicit_live_expressions->emplace_back(unique); + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(unique); } // Remove references on all the things we no longer need for (std::set:: @@ -6133,10 +6128,9 @@ namespace Legion { &local_mutator); else result->add_base_expression_reference(LIVE_EXPR_REF, mutator); - if (implicit_live_expressions == NULL) - implicit_live_expressions = - new std::vector(); - implicit_live_expressions->emplace_back(result); + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(result); } // Remove the extra expression reference we added if (result->remove_base_expression_reference(REGION_TREE_REF)) @@ -6157,9 +6151,9 @@ namespace 
Legion { result->add_base_expression_reference(LIVE_EXPR_REF,&local_mutator); else result->add_base_expression_reference(LIVE_EXPR_REF, mutator); - if (implicit_live_expressions == NULL) - implicit_live_expressions = new std::vector; - implicit_live_expressions->emplace_back(result); + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(result); } // Remove the reference added by the trie traversal if (result->remove_live_reference(REGION_TREE_REF)) @@ -6355,14 +6349,14 @@ namespace Legion { } if (mutator == NULL) { - LocalReferenceMutator local_mutator; + LocalReferenceMutator local_mutator(true/*waiter*/); result->add_base_expression_reference(LIVE_EXPR_REF, &local_mutator); } else result->add_base_expression_reference(LIVE_EXPR_REF, mutator); - if (implicit_live_expressions == NULL) - implicit_live_expressions = new std::vector; - implicit_live_expressions->emplace_back(result); + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(result); // Remove the gc reference that comes back from finding it in the tree if (result->remove_live_reference(REGION_TREE_REF)) assert(false); // should never hit this @@ -6516,8 +6510,9 @@ namespace Legion { } const AddressSpaceID owner = IndexSpaceExpression::get_owner_space(remote_expr_id, runtime); - if (owner == runtime->address_space) - return origin; +#ifdef DEBUG_LEGION + assert(owner != runtime->address_space); +#endif // Retake the lock in exclusive mode and see if we lost the race RtEvent wait_on; RtUserEvent request_event; @@ -6572,16 +6567,38 @@ namespace Legion { //-------------------------------------------------------------------------- IndexSpaceExpression* RegionTreeForest::find_remote_expression( - IndexSpaceExprID remote_expr_id) + const PendingRemoteExpression &pending) 
//-------------------------------------------------------------------------- { - AutoLock l_lock(lookup_is_op_lock, 1, false/*exclusive*/); - std::map::const_iterator - finder = remote_expressions.find(remote_expr_id); + if (pending.is_index_space) + return get_node(pending.handle); + IndexSpaceExpression *result = NULL; + { + AutoLock l_lock(lookup_is_op_lock, 1, false/*exclusive*/); + std::map::const_iterator + finder = remote_expressions.find(pending.remote_expr_id); #ifdef DEBUG_LEGION - assert(finder != remote_expressions.end()); + assert(finder != remote_expressions.end()); #endif - return finder->second; + result = finder->second; + } + if (pending.has_reference) + { +#ifdef DEBUG_LEGION + IndexSpaceOperation *op = dynamic_cast(result); + assert(op != NULL); +#else + IndexSpaceOperation *op = static_cast(result); +#endif + LocalReferenceMutator mutator(false/*waiter*/); + result->add_base_expression_reference(LIVE_EXPR_REF, &mutator); + op->send_remote_valid_decrement(pending.source, NULL/*mutator*/, + mutator.get_done_event()); + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(result); + } + return result; } //-------------------------------------------------------------------------- @@ -6592,10 +6609,8 @@ namespace Legion { AutoLock l_lock(lookup_is_op_lock); std::map::iterator finder = remote_expressions.find(remote_expr_id); -#ifdef DEBUG_LEGION - assert(finder != remote_expressions.end()); -#endif - remote_expressions.erase(finder); + if (finder != remote_expressions.end()) + remote_expressions.erase(finder); } //-------------------------------------------------------------------------- @@ -6822,8 +6837,32 @@ namespace Legion { derez.deserialize(is_local); if (is_local) { + bool has_reference; + derez.deserialize(has_reference); IndexSpaceExpression *result; derez.deserialize(result); + if (has_reference) + { + if (source != 
forest->runtime->address_space) + { +#ifdef DEBUG_LEGION + IndexSpaceOperation *op = + dynamic_cast(result); + assert(op != NULL); +#else + IndexSpaceOperation *op = static_cast(result); +#endif + // Make this valid and then send the removal of the + // remote did expression + LocalReferenceMutator mutator(false/*waiter*/); + op->add_base_expression_reference(LIVE_EXPR_REF, &mutator); + op->send_remote_valid_decrement(source, NULL/*mutator*/, + mutator.get_done_event()); + } + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(result); + } return result; } bool is_index_space; @@ -6835,41 +6874,107 @@ namespace Legion { derez.deserialize(handle); return forest->get_node(handle); } + bool has_reference; + derez.deserialize(has_reference); IndexSpaceExprID remote_expr_id; derez.deserialize(remote_expr_id); IndexSpaceExpression *origin; derez.deserialize(origin); - return forest->find_or_request_remote_expression(remote_expr_id, origin); + IndexSpaceExpression *result = + forest->find_or_request_remote_expression(remote_expr_id, origin); + if (has_reference) + { +#ifdef DEBUG_LEGION + IndexSpaceOperation *op = dynamic_cast(result); + assert(op != NULL); +#else + IndexSpaceOperation *op = static_cast(result); +#endif + LocalReferenceMutator mutator(false/*waiter*/); + result->add_base_expression_reference(LIVE_EXPR_REF, &mutator); + op->send_remote_valid_decrement(source, NULL/*mutator*/, + mutator.get_done_event()); + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(result); + } + return result; } //-------------------------------------------------------------------------- /*static*/ IndexSpaceExpression* IndexSpaceExpression::unpack_expression( - Deserializer &derez, RegionTreeForest *forest, - AddressSpaceID source, bool &is_local, - bool &is_index_space, 
IndexSpace &handle, - IndexSpaceExprID &remote_expr_id, RtEvent &wait_for) + Deserializer &derez, RegionTreeForest *forest, AddressSpaceID source, + PendingRemoteExpression &pending, RtEvent &wait_for) //-------------------------------------------------------------------------- { // Handle the special case where this is a local index space expression + bool is_local; derez.deserialize(is_local); if (is_local) { + derez.deserialize(pending.has_reference); IndexSpaceExpression *result; derez.deserialize(result); + if (pending.has_reference) + { + if (source != forest->runtime->address_space) + { +#ifdef DEBUG_LEGION + IndexSpaceOperation *op = + dynamic_cast(result); + assert(op != NULL); +#else + IndexSpaceOperation *op = static_cast(result); +#endif + // Make this valid and then send the removal of the + // remote did expression + LocalReferenceMutator mutator(false/*waiter*/); + op->add_base_expression_reference(LIVE_EXPR_REF, &mutator); + op->send_remote_valid_decrement(source, NULL/*mutator*/, + mutator.get_done_event()); + } + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(result); + } return result; } - derez.deserialize(is_index_space); + derez.deserialize(pending.is_index_space); // If this is an index space it is easy - if (is_index_space) + if (pending.is_index_space) { - derez.deserialize(handle); - return forest->get_node(handle, &wait_for); + derez.deserialize(pending.handle); + return forest->get_node(pending.handle, &wait_for); } - derez.deserialize(remote_expr_id); + derez.deserialize(pending.has_reference); + derez.deserialize(pending.remote_expr_id); IndexSpaceExpression *origin; derez.deserialize(origin); - return forest->find_or_request_remote_expression(remote_expr_id, - origin, &wait_for); + IndexSpaceExpression *result = + forest->find_or_request_remote_expression(pending.remote_expr_id, + origin, &wait_for); + if (result == NULL) + { + 
pending.source = source; + return result; + } + if (pending.has_reference) + { +#ifdef DEBUG_LEGION + IndexSpaceOperation *op = dynamic_cast(result); + assert(op != NULL); +#else + IndexSpaceOperation *op = static_cast(result); +#endif + LocalReferenceMutator mutator(false/*waiter*/); + result->add_base_expression_reference(LIVE_EXPR_REF, &mutator); + op->send_remote_valid_decrement(source, NULL/*mutator*/, + mutator.get_done_event()); + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(result); + } + return result; } ///////////////////////////////////////////////////////////// @@ -7048,7 +7153,7 @@ namespace Legion { { if (mutator == NULL) { - LocalReferenceMutator local_mutator; + LocalReferenceMutator local_mutator(true/*waiter*/); add_base_valid_ref(source, &local_mutator, count); } else @@ -7071,7 +7176,7 @@ namespace Legion { { if (mutator == NULL) { - LocalReferenceMutator local_mutator; + LocalReferenceMutator local_mutator(true/*waiter*/); add_nested_valid_ref(source, &local_mutator, count); } else @@ -8573,7 +8678,7 @@ namespace Legion { // If this is above then we don't care about it if it // is not still valid bool remove_reference = false; - LocalReferenceMutator mutator; + LocalReferenceMutator mutator(true/*waiter*/); if (has_reference) { AutoLock n_lock(node_lock); @@ -8607,7 +8712,7 @@ namespace Legion { assert(record.node == this); #endif bool remove_reference = false; - LocalReferenceMutator mutator; + LocalReferenceMutator mutator(true/*waiter*/); { AutoLock n_lock(node_lock); { @@ -8657,7 +8762,6 @@ namespace Legion { //-------------------------------------------------------------------------- { bool remove_reference; - LocalReferenceMutator mutator; { AutoLock n_lock(node_lock); remove_reference = (--send_references == 0); @@ -8994,7 +9098,8 @@ namespace Legion { } //-------------------------------------------------------------------------- 
- void IndexSpaceNode::pack_expression(Serializer &rez, AddressSpaceID target) + void IndexSpaceNode::pack_expression(Serializer &rez, AddressSpaceID target, + bool need_reference) //-------------------------------------------------------------------------- { if (target != context->runtime->address_space) @@ -9006,7 +9111,10 @@ namespace Legion { else { rez.serialize(true/*local*/); + rez.serialize(need_reference); rez.serialize(this); + if (need_reference) + add_base_expression_reference(LIVE_EXPR_REF); } } @@ -9021,7 +9129,7 @@ namespace Legion { // This could be a performance bug since it will block if we // have to send a reference to a remote node, but that should // never actually happen - LocalReferenceMutator mutator; + LocalReferenceMutator mutator(true/*waiter*/); add_base_gc_ref(REMOTE_DID_REF, &mutator); } @@ -9060,7 +9168,7 @@ namespace Legion { { if (mutator == NULL) { - LocalReferenceMutator local_mutator; + LocalReferenceMutator local_mutator(true/*waiter*/); add_base_valid_ref(source, &local_mutator, count); } else @@ -9083,7 +9191,7 @@ namespace Legion { { if (mutator == NULL) { - LocalReferenceMutator local_mutator; + LocalReferenceMutator local_mutator(true/*waiter*/); add_nested_valid_ref(source, &local_mutator, count); } else diff --git a/runtime/legion/region_tree.h b/runtime/legion/region_tree.h index ddd6e93b3b..af32f82af3 100644 --- a/runtime/legion/region_tree.h +++ b/runtime/legion/region_tree.h @@ -43,7 +43,7 @@ namespace Legion { /** * \struct IndirectRecord - * A small helper calss for performing exchanges of + * A small helper class for performing exchanges of * instances for indirection copies */ struct IndirectRecord : public LegionHeapify { @@ -62,6 +62,24 @@ namespace Legion { Domain domain; }; + /** + * \struct PendingRemoteExpression + * A small helper class for passing arguments associated + * with deferred calls to unpack remote expressions + */ + struct PendingRemoteExpression { + public: + PendingRemoteExpression(void) + : 
handle(IndexSpace::NO_SPACE), remote_expr_id(0), + source(0), is_index_space(false), has_reference(false) { } + public: + IndexSpace handle; + IndexSpaceExprID remote_expr_id; + AddressSpaceID source; + bool is_index_space; + bool has_reference; + }; + /** * \class OperationCreator * A base class for handling the creation of index space operations @@ -877,7 +895,7 @@ namespace Legion { IndexSpaceExprID remote_expr_id, IndexSpaceExpression *origin, RtEvent *wait_for = NULL); IndexSpaceExpression* find_remote_expression( - IndexSpaceExprID remote_expr_id); + const PendingRemoteExpression &pending_expression); void unregister_remote_expression(IndexSpaceExprID remote_expr_id); void handle_remote_expression_request(Deserializer &derez, AddressSpaceID source); @@ -1110,7 +1128,8 @@ namespace Legion { virtual void tighten_index_space(void) = 0; virtual bool check_empty(void) = 0; virtual size_t get_volume(void) = 0; - virtual void pack_expression(Serializer &rez, AddressSpaceID target) = 0; + virtual void pack_expression(Serializer &rez, AddressSpaceID target, + bool need_reference = true) = 0; virtual void pack_expression_value(Serializer &rez, AddressSpaceID target) = 0; public: @@ -1328,11 +1347,10 @@ namespace Legion { std::set &expressions); public: static IndexSpaceExpression* unpack_expression(Deserializer &derez, - RegionTreeForest *forest, AddressSpaceID source); + RegionTreeForest *forest, AddressSpaceID source); static IndexSpaceExpression* unpack_expression(Deserializer &derez, RegionTreeForest *forest, AddressSpaceID source, - bool &is_local,bool &is_index_space,IndexSpace &handle, - IndexSpaceExprID &remote_expr_id, RtEvent &wait_for); + PendingRemoteExpression &pending, RtEvent &wait_for); public: const TypeTag type_tag; const IndexSpaceExprID expr_id; @@ -1360,7 +1378,7 @@ namespace Legion { { if (m == NULL) { - LocalReferenceMutator local_mutator; + LocalReferenceMutator local_mutator(true/*waiting*/); expr->add_base_expression_reference(LIVE_EXPR_REF, 
&local_mutator); } else @@ -1452,7 +1470,8 @@ namespace Legion { virtual void tighten_index_space(void) = 0; virtual bool check_empty(void) = 0; virtual size_t get_volume(void) = 0; - virtual void pack_expression(Serializer &rez, AddressSpaceID target) = 0; + virtual void pack_expression(Serializer &rez, AddressSpaceID target, + bool need_reference = true) = 0; virtual void pack_expression_value(Serializer &rez, AddressSpaceID target) = 0; public: @@ -1512,7 +1531,8 @@ namespace Legion { virtual void tighten_index_space(void); virtual bool check_empty(void); virtual size_t get_volume(void); - virtual void pack_expression(Serializer &rez, AddressSpaceID target); + virtual void pack_expression(Serializer &rez, AddressSpaceID target, + bool need_reference = true); virtual void pack_expression_value(Serializer &rez, AddressSpaceID target) = 0; virtual bool invalidate_operation(void) = 0; @@ -2080,7 +2100,8 @@ namespace Legion { virtual bool set_domain(const Domain &domain, AddressSpaceID space) = 0; virtual void tighten_index_space(void) = 0; virtual bool check_empty(void) = 0; - virtual void pack_expression(Serializer &rez, AddressSpaceID target); + virtual void pack_expression(Serializer &rez, AddressSpaceID target, + bool need_reference); virtual void pack_expression_value(Serializer &rez,AddressSpaceID target); public: #ifdef DEBUG_LEGION diff --git a/runtime/legion/region_tree.inl b/runtime/legion/region_tree.inl index 7c5ed0420c..d0c57c4c7f 100644 --- a/runtime/legion/region_tree.inl +++ b/runtime/legion/region_tree.inl @@ -1534,25 +1534,42 @@ namespace Legion { //-------------------------------------------------------------------------- template void IndexSpaceOperationT::pack_expression(Serializer &rez, - AddressSpaceID target) + AddressSpaceID target, bool need_reference) //-------------------------------------------------------------------------- { +#ifdef DEBUG_LEGION + assert(this->is_valid()); +#endif if (target == this->local_space) { 
rez.serialize(true/*local*/); + rez.serialize(need_reference); rez.serialize(this); + // Add a live expression reference to keep this live through the message + if (need_reference) + this->add_base_expression_reference(LIVE_EXPR_REF); } else if (target == this->owner_space) { rez.serialize(true/*local*/); + rez.serialize(need_reference); rez.serialize(origin_expr); + // Add a reference here that we'll remove after we've added a reference + // on the target space expression + if (need_reference) + this->add_base_expression_reference(REMOTE_DID_REF); } else { rez.serialize(false/*local*/); rez.serialize(false/*index space*/); + rez.serialize(need_reference); rez.serialize(expr_id); rez.serialize(origin_expr); + // Add a reference here that we'll remove after we've added a reference + // on the target space expression + if (need_reference) + this->add_base_expression_reference(REMOTE_DID_REF); } } @@ -2339,9 +2356,9 @@ namespace Legion { { // This is another kind of live expression made by the region tree this->add_base_expression_reference(LIVE_EXPR_REF); - if (implicit_live_expressions == NULL) - implicit_live_expressions = new std::vector; - implicit_live_expressions->emplace_back(this); + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(this); #ifdef DEBUG_LEGION assert(num_rects > 0); #endif @@ -2534,8 +2551,7 @@ namespace Legion { void RemoteExpression::remove_operation(void) //-------------------------------------------------------------------------- { - // should never be called - assert(false); + // nothing to do here } ///////////////////////////////////////////////////////////// diff --git a/runtime/legion/runtime.cc b/runtime/legion/runtime.cc index 978b1c2706..18ba843816 100644 --- a/runtime/legion/runtime.cc +++ b/runtime/legion/runtime.cc @@ -75,7 +75,7 @@ namespace Legion { __thread AutoLock *local_lock_list = NULL; __thread UniqueID 
implicit_provenance = 0; __thread unsigned inside_registration_callback = NO_REGISTRATION_CALLBACK; - __thread std::vector *implicit_live_expressions=NULL; + __thread ImplicitReferenceTracker *implicit_reference_tracker = NULL; const LgEvent LgEvent::NO_LG_EVENT = LgEvent(); const ApEvent ApEvent::NO_AP_EVENT = ApEvent(); @@ -5176,7 +5176,6 @@ namespace Legion { //-------------------------------------------------------------------------- { bool remove_min_reference = false; - IgnoreReferenceMutator mutator; if (!is_owner) { RtUserEvent never_gc_wait; @@ -5255,14 +5254,14 @@ namespace Legion { bool remove_duplicate = false; if (success.load()) { - LocalReferenceMutator local_mutator; + LocalReferenceMutator local_mutator(true/*waiter*/); // Add our local reference manager->add_base_valid_ref(NEVER_GC_REF, &local_mutator); const RtEvent reference_effects = local_mutator.get_done_event(); manager->send_remote_valid_decrement(owner_space, NULL, reference_effects); if (reference_effects.exists()) - mutator.record_reference_mutation_effect(reference_effects); + local_mutator.record_reference_mutation_effect(reference_effects); // Then record it AutoLock m_lock(manager_lock); #ifdef DEBUG_LEGION @@ -5279,7 +5278,7 @@ namespace Legion { info.mapper_priorities[key] = LEGION_GC_NEVER_PRIORITY; } if (remove_duplicate && - manager->remove_base_valid_ref(NEVER_GC_REF, &mutator)) + manager->remove_base_valid_ref(NEVER_GC_REF)) delete manager; } } @@ -5288,7 +5287,7 @@ namespace Legion { // If this a max priority, try adding the reference beforehand, if // it fails then we know the instance is already deleted so whatever if ((priority == LEGION_GC_NEVER_PRIORITY) && - !manager->acquire_instance(NEVER_GC_REF, &mutator)) + !manager->acquire_instance(NEVER_GC_REF, NULL/*mutator*/)) return; // Do the update locally AutoLock m_lock(manager_lock); @@ -5363,8 +5362,7 @@ namespace Legion { } } } - if (remove_min_reference && - manager->remove_base_valid_ref(NEVER_GC_REF, &mutator)) + if 
(remove_min_reference && manager->remove_base_valid_ref(NEVER_GC_REF)) delete manager; } @@ -5971,7 +5969,7 @@ namespace Legion { // and then remove the remote DID if (acquire) { - LocalReferenceMutator local_mutator; + LocalReferenceMutator local_mutator(false/*waiter*/); manager->add_base_valid_ref(MAPPING_ACQUIRE_REF, &local_mutator); const RtEvent reference_effects = local_mutator.get_done_event(); manager->send_remote_valid_decrement(source, NULL, @@ -6105,7 +6103,7 @@ namespace Legion { // and then remove the remote DID if (acquire) { - LocalReferenceMutator local_mutator; + LocalReferenceMutator local_mutator(true/*waiter*/); manager->add_base_valid_ref(MAPPING_ACQUIRE_REF, &local_mutator); const RtEvent reference_effects = local_mutator.get_done_event(); manager->send_remote_valid_decrement(source, NULL, @@ -6325,7 +6323,7 @@ namespace Legion { (*target)[index] = true; PhysicalManager *manager; derez.deserialize(manager); - LocalReferenceMutator local_mutator; + LocalReferenceMutator local_mutator(false/*waiter*/); manager->add_base_valid_ref(MAPPING_ACQUIRE_REF, &local_mutator); const RtEvent reference_effects = local_mutator.get_done_event(); manager->send_remote_valid_decrement(source, NULL, reference_effects); @@ -13145,23 +13143,17 @@ namespace Legion { //-------------------------------------------------------------------------- { #ifdef DEBUG_LEGION - assert(implicit_live_expressions == NULL); + assert(implicit_reference_tracker == NULL); #endif if (ctx != DUMMY_CONTEXT) ctx->begin_runtime_call(); const bool result = forest->is_index_partition_disjoint(p); if (ctx != DUMMY_CONTEXT) ctx->end_runtime_call(); - else if (implicit_live_expressions != NULL) + else if (implicit_reference_tracker != NULL) { - // Remove references to any live index space expressions we have - for (std::vector::const_iterator it = - implicit_live_expressions->begin(); it != - implicit_live_expressions->end(); it++) - if ((*it)->remove_base_expression_reference(LIVE_EXPR_REF)) - 
delete (*it); - delete implicit_live_expressions; - implicit_live_expressions = NULL; + delete implicit_reference_tracker; + implicit_reference_tracker = NULL; } return result; } @@ -13171,19 +13163,13 @@ namespace Legion { //-------------------------------------------------------------------------- { #ifdef DEBUG_LEGION - assert(implicit_live_expressions == NULL); + assert(implicit_reference_tracker == NULL); #endif const bool result = forest->is_index_partition_disjoint(p); - if (implicit_live_expressions != NULL) + if (implicit_reference_tracker != NULL) { - // Remove references to any live index space expressions we have - for (std::vector::const_iterator it = - implicit_live_expressions->begin(); it != - implicit_live_expressions->end(); it++) - if ((*it)->remove_base_expression_reference(LIVE_EXPR_REF)) - delete (*it); - delete implicit_live_expressions; - implicit_live_expressions = NULL; + delete implicit_reference_tracker; + implicit_reference_tracker = NULL; } return result; } @@ -13193,23 +13179,17 @@ namespace Legion { //-------------------------------------------------------------------------- { #ifdef DEBUG_LEGION - assert(implicit_live_expressions == NULL); + assert(implicit_reference_tracker == NULL); #endif if (ctx != DUMMY_CONTEXT) ctx->begin_runtime_call(); bool result = forest->is_index_partition_complete(p); if (ctx != DUMMY_CONTEXT) ctx->end_runtime_call(); - else if (implicit_live_expressions != NULL) + else if (implicit_reference_tracker != NULL) { - // Remove references to any live index space expressions we have - for (std::vector::const_iterator it = - implicit_live_expressions->begin(); it != - implicit_live_expressions->end(); it++) - if ((*it)->remove_base_expression_reference(LIVE_EXPR_REF)) - delete (*it); - delete implicit_live_expressions; - implicit_live_expressions = NULL; + delete implicit_reference_tracker; + implicit_reference_tracker = NULL; } return result; } @@ -13219,19 +13199,13 @@ namespace Legion { 
//-------------------------------------------------------------------------- { #ifdef DEBUG_LEGION - assert(implicit_live_expressions == NULL); + assert(implicit_reference_tracker == NULL); #endif const bool result = forest->is_index_partition_complete(p); - if (implicit_live_expressions != NULL) + if (implicit_reference_tracker != NULL) { - // Remove references to any live index space expressions we have - for (std::vector::const_iterator it = - implicit_live_expressions->begin(); it != - implicit_live_expressions->end(); it++) - if ((*it)->remove_base_expression_reference(LIVE_EXPR_REF)) - delete (*it); - delete implicit_live_expressions; - implicit_live_expressions = NULL; + delete implicit_reference_tracker; + implicit_reference_tracker = NULL; } return result; } @@ -24429,7 +24403,7 @@ namespace Legion { if (!runtime->local_utils.empty()) assert(implicit_context == NULL); // this better hold #endif - assert(implicit_live_expressions == NULL); + assert(implicit_reference_tracker == NULL); #endif implicit_runtime = runtime; // We immediately bump the priority of all meta-tasks once they start @@ -25060,16 +25034,10 @@ namespace Legion { default: assert(false); // should never get here } - if (implicit_live_expressions != NULL) + if (implicit_reference_tracker != NULL) { - // Remove references to any live index space expressions we have - for (std::vector::const_iterator it = - implicit_live_expressions->begin(); it != - implicit_live_expressions->end(); it++) - if ((*it)->remove_base_expression_reference(LIVE_EXPR_REF)) - delete (*it); - delete implicit_live_expressions; - implicit_live_expressions = NULL; + delete implicit_reference_tracker; + implicit_reference_tracker = NULL; } #ifdef DEBUG_LEGION if (tid < LG_BEGIN_SHUTDOWN_TASK_IDS) @@ -25148,7 +25116,7 @@ namespace Legion { Runtime *runtime = *((Runtime**)userdata); #ifdef DEBUG_LEGION assert(userlen == sizeof(Runtime**)); - assert(implicit_live_expressions == NULL); + assert(implicit_reference_tracker == 
NULL); #endif implicit_runtime = runtime; // We immediately bump the priority of all meta-tasks once they start @@ -25188,16 +25156,10 @@ namespace Legion { default: assert(false); // should never get here } - if (implicit_live_expressions != NULL) + if (implicit_reference_tracker != NULL) { - // Remove references to any live index space expressions we have - for (std::vector::const_iterator it = - implicit_live_expressions->begin(); it != - implicit_live_expressions->end(); it++) - if ((*it)->remove_base_expression_reference(LIVE_EXPR_REF)) - delete (*it); - delete implicit_live_expressions; - implicit_live_expressions = NULL; + delete implicit_reference_tracker; + implicit_reference_tracker = NULL; } #ifdef DEBUG_LEGION runtime->decrement_total_outstanding_tasks(tid, true/*meta*/); From 9cb66ba80bba622ccaab950e6bcffcdfc32caeba Mon Sep 17 00:00:00 2001 From: Mike Bauer Date: Mon, 13 Dec 2021 19:49:51 -0800 Subject: [PATCH 21/36] legion: more work on distributed reference counting for expressions --- runtime/legion/legion_analysis.cc | 9 +- runtime/legion/legion_analysis.h | 3 +- runtime/legion/legion_instances.cc | 4 +- runtime/legion/legion_types.h | 4 - runtime/legion/legion_views.cc | 11 +- runtime/legion/region_tree.cc | 264 +++++++++++++++++++---------- runtime/legion/region_tree.h | 18 +- runtime/legion/region_tree.inl | 15 +- 8 files changed, 192 insertions(+), 136 deletions(-) diff --git a/runtime/legion/legion_analysis.cc b/runtime/legion/legion_analysis.cc index a929dbae3f..ac0b07211b 100644 --- a/runtime/legion/legion_analysis.cc +++ b/runtime/legion/legion_analysis.cc @@ -130,8 +130,8 @@ namespace Legion { } //-------------------------------------------------------------------------- - void PhysicalUser::pack_user(Serializer &rez, const AddressSpaceID target, - bool need_reference) const + void PhysicalUser::pack_user(Serializer &rez, + const AddressSpaceID target) const //-------------------------------------------------------------------------- { 
RezCheck z(rez); @@ -139,7 +139,7 @@ namespace Legion { rez.serialize(collect_event); #endif rez.serialize(usage); - expr->pack_expression(rez, target, need_reference); + expr->pack_expression(rez, target); rez.serialize(op_id); rez.serialize(index); rez.serialize(copy_user); @@ -13558,8 +13558,7 @@ namespace Legion { else rez.serialize(logical_owner_space); } - // No need for a reference here since we know we'll continue holding it - set_expr->pack_expression(rez, target, false/*need reference*/); + set_expr->pack_expression(rez, target); if (index_space_node != NULL) rez.serialize(index_space_node->handle); else diff --git a/runtime/legion/legion_analysis.h b/runtime/legion/legion_analysis.h index 116d605d78..26015fd7cf 100644 --- a/runtime/legion/legion_analysis.h +++ b/runtime/legion/legion_analysis.h @@ -705,8 +705,7 @@ namespace Legion { public: PhysicalUser& operator=(const PhysicalUser &rhs); public: - void pack_user(Serializer &rez, const AddressSpaceID target, - bool need_reference = true) const; + void pack_user(Serializer &rez, const AddressSpaceID target) const; static PhysicalUser* unpack_user(Deserializer &derez, RegionTreeForest *forest, const AddressSpaceID source); public: diff --git a/runtime/legion/legion_instances.cc b/runtime/legion/legion_instances.cc index 83be9c6656..f2dffa957d 100644 --- a/runtime/legion/legion_instances.cc +++ b/runtime/legion/legion_instances.cc @@ -1480,7 +1480,7 @@ namespace Legion { rez.serialize(instance); rez.serialize(instance_footprint); // No need for a reference here since we know we'll continue holding it - instance_domain->pack_expression(rez, target, false/*need reference*/); + instance_domain->pack_expression(rez, target); rez.serialize(piece_list_size); if (piece_list_size > 0) rez.serialize(piece_list, piece_list_size); @@ -2902,7 +2902,7 @@ namespace Legion { rez.serialize(point_space->handle); rez.serialize(instance_footprint); // No need for a reference here since we know we'll continue holding it - 
instance_domain->pack_expression(rez, target, false/*need reference*/); + instance_domain->pack_expression(rez, target); rez.serialize(field_space_node->handle); rez.serialize(tree_id); rez.serialize(redop); diff --git a/runtime/legion/legion_types.h b/runtime/legion/legion_types.h index cdfc372de4..1eff2f0a1a 100644 --- a/runtime/legion/legion_types.h +++ b/runtime/legion/legion_types.h @@ -2388,9 +2388,7 @@ namespace Legion { unsigned local_callback = Internal::inside_registration_callback; // Save the reference tracker that we have ImplicitReferenceTracker *local_tracker = implicit_reference_tracker; -#ifdef DEBUG_LEGION Internal::implicit_reference_tracker = NULL; -#endif // Check to see if we have any local locks to notify if (Internal::local_lock_list != NULL) { @@ -2448,9 +2446,7 @@ namespace Legion { unsigned local_callback = Internal::inside_registration_callback; // Save the reference tracker that we have ImplicitReferenceTracker *local_tracker = implicit_reference_tracker; -#ifdef DEBUG_LEGION Internal::implicit_reference_tracker = NULL; -#endif // Check to see if we have any local locks to notify if (Internal::local_lock_list != NULL) { diff --git a/runtime/legion/legion_views.cc b/runtime/legion/legion_views.cc index 4a6face762..b1750141d7 100644 --- a/runtime/legion/legion_views.cc +++ b/runtime/legion/legion_views.cc @@ -1580,9 +1580,7 @@ namespace Legion { { const unsigned index = indexes.size(); rez.serialize(index); - // No need for a reference since we're replicating so we know - // that we'll continue holding this reference - it->first->pack_user(rez, target, false/*need reference*/); + it->first->pack_user(rez, target); indexes[it->first] = index; } else @@ -1623,9 +1621,7 @@ namespace Legion { { const unsigned index = indexes.size(); rez.serialize(index); - // No need for a reference since we're replicating so we know - // that we'll continue holding the reference to keep it alive - it->first->pack_user(rez, target, false/*need reference*/); + 
it->first->pack_user(rez, target); indexes[it->first] = index; } else @@ -1650,8 +1646,7 @@ namespace Legion { for (FieldMaskSet::const_iterator it = needed_subviews.begin(); it != needed_subviews.end(); it++) { - // No need for the reference since we're replicating this - it->first->view_expr->pack_expression(rez, target, false/*need ref*/); + it->first->view_expr->pack_expression(rez, target); rez.serialize(it->second); it->first->pack_replication(rez, indexes, it->second, target); } diff --git a/runtime/legion/region_tree.cc b/runtime/legion/region_tree.cc index a8d096ae74..493fcda22d 100644 --- a/runtime/legion/region_tree.cc +++ b/runtime/legion/region_tree.cc @@ -6571,19 +6571,38 @@ namespace Legion { //-------------------------------------------------------------------------- { if (pending.is_index_space) - return get_node(pending.handle); - IndexSpaceExpression *result = NULL; { - AutoLock l_lock(lookup_is_op_lock, 1, false/*exclusive*/); - std::map::const_iterator - finder = remote_expressions.find(pending.remote_expr_id); -#ifdef DEBUG_LEGION - assert(finder != remote_expressions.end()); -#endif - result = finder->second; + IndexSpaceNode *node = get_node(pending.handle); + LocalReferenceMutator mutator(false/*waiter*/); + node->add_base_expression_reference(LIVE_EXPR_REF, &mutator); + const RtEvent added = mutator.get_done_event(); + // Special case here: if the source was the owner and we didn't + // just send a message to add our reference then we can buffer up + // our reference to be removed when we are no longer valid + // Be very careful here! You can only do this if the expression + // was sent from the source or you risk reference count cycles! 
+ if ((pending.source == node->owner_space) && + (!added.exists() || added.has_triggered())) + node->record_remote_owner_valid_reference(); + else + node->send_remote_valid_decrement(pending.source, NULL/*mut*/, added); + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(node); + return node; } - if (pending.has_reference) + else { + IndexSpaceExpression *result = NULL; + { + AutoLock l_lock(lookup_is_op_lock, 1, false/*exclusive*/); + std::map::const_iterator + finder = remote_expressions.find(pending.remote_expr_id); +#ifdef DEBUG_LEGION + assert(finder != remote_expressions.end()); +#endif + result = finder->second; + } #ifdef DEBUG_LEGION IndexSpaceOperation *op = dynamic_cast(result); assert(op != NULL); @@ -6592,13 +6611,22 @@ namespace Legion { #endif LocalReferenceMutator mutator(false/*waiter*/); result->add_base_expression_reference(LIVE_EXPR_REF, &mutator); - op->send_remote_valid_decrement(pending.source, NULL/*mutator*/, - mutator.get_done_event()); + const RtEvent added = mutator.get_done_event(); + // Special case here: if the source was the owner and we didn't + // just send a message to add our reference then we can buffer up + // our reference to be removed when we are no longer valid + // Be very careful here! You can only do this if the expression + // was sent from the source or you risk reference count cycles! 
+ if ((pending.source == op->owner_space) && + (!added.exists() || added.has_triggered())) + result->record_remote_owner_valid_reference(); + else + op->send_remote_valid_decrement(pending.source,NULL/*mutator*/,added); if (implicit_reference_tracker == NULL) implicit_reference_tracker = new ImplicitReferenceTracker; implicit_reference_tracker->record_live_expression(result); + return result; } - return result; } //-------------------------------------------------------------------------- @@ -6693,8 +6721,9 @@ namespace Legion { //-------------------------------------------------------------------------- IndexSpaceExpression::IndexSpaceExpression(LocalLock &lock) - : type_tag(0), expr_id(0), expr_lock(lock), canonical(NULL), volume(0), - has_volume(false), empty(false), has_empty(false) + : type_tag(0), expr_id(0), expr_lock(lock), canonical(NULL), + remote_owner_valid_references(0), + volume(0), has_volume(false), empty(false), has_empty(false) //-------------------------------------------------------------------------- { } @@ -6703,8 +6732,8 @@ namespace Legion { IndexSpaceExpression::IndexSpaceExpression(TypeTag tag, Runtime *rt, LocalLock &lock) : type_tag(tag), expr_id(rt->get_unique_index_space_expr_id()), - expr_lock(lock), canonical(NULL), volume(0), has_volume(false), - empty(false), has_empty(false) + expr_lock(lock), canonical(NULL), remote_owner_valid_references(0), + volume(0), has_volume(false), empty(false), has_empty(false) //-------------------------------------------------------------------------- { } @@ -6712,8 +6741,9 @@ namespace Legion { //-------------------------------------------------------------------------- IndexSpaceExpression::IndexSpaceExpression(TypeTag tag, IndexSpaceExprID id, LocalLock &lock) - : type_tag(tag), expr_id(id), expr_lock(lock), canonical(NULL), volume(0), - has_volume(false), empty(false), has_empty(false) + : type_tag(tag), expr_id(id), expr_lock(lock), canonical(NULL), + remote_owner_valid_references(0), + 
volume(0), has_volume(false), empty(false), has_empty(false) //-------------------------------------------------------------------------- { } @@ -6837,32 +6867,29 @@ namespace Legion { derez.deserialize(is_local); if (is_local) { - bool has_reference; - derez.deserialize(has_reference); IndexSpaceExpression *result; derez.deserialize(result); - if (has_reference) + if (source != forest->runtime->address_space) { - if (source != forest->runtime->address_space) - { #ifdef DEBUG_LEGION - IndexSpaceOperation *op = - dynamic_cast(result); - assert(op != NULL); + IndexSpaceOperation *op = + dynamic_cast(result); + assert(op != NULL); #else - IndexSpaceOperation *op = static_cast(result); -#endif - // Make this valid and then send the removal of the - // remote did expression - LocalReferenceMutator mutator(false/*waiter*/); - op->add_base_expression_reference(LIVE_EXPR_REF, &mutator); - op->send_remote_valid_decrement(source, NULL/*mutator*/, - mutator.get_done_event()); - } - if (implicit_reference_tracker == NULL) - implicit_reference_tracker = new ImplicitReferenceTracker; - implicit_reference_tracker->record_live_expression(result); + IndexSpaceOperation *op = static_cast(result); +#endif + // Make this valid and then send the removal of the + // remote did expression + LocalReferenceMutator mutator(false/*waiter*/); + op->add_base_expression_reference(LIVE_EXPR_REF, &mutator); + // Always need to send this reference removal back immediately + // in order to avoid reference counting deadlock + op->send_remote_valid_decrement(source, NULL/*mutator*/, + mutator.get_done_event()); } + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(result); return result; } bool is_index_space; @@ -6872,18 +6899,33 @@ namespace Legion { { IndexSpace handle; derez.deserialize(handle); - return forest->get_node(handle); + IndexSpaceNode *node = forest->get_node(handle); + 
LocalReferenceMutator mutator(false/*waiter*/); + node->add_base_expression_reference(LIVE_EXPR_REF, &mutator); + const RtEvent added = mutator.get_done_event(); + // Special case here: if the source was the owner and we didn't + // just send a message to add our reference then we can buffer up + // our reference to be removed when we are no longer valid + // Be very careful here! You can only do this if the expression + // was sent from the source or you risk reference count cycles! + if ((source == node->owner_space) && + (!added.exists() || added.has_triggered())) + node->record_remote_owner_valid_reference(); + else + node->send_remote_valid_decrement(source, NULL/*mutator*/, added); + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(node); + return node; } - bool has_reference; - derez.deserialize(has_reference); - IndexSpaceExprID remote_expr_id; - derez.deserialize(remote_expr_id); - IndexSpaceExpression *origin; - derez.deserialize(origin); - IndexSpaceExpression *result = - forest->find_or_request_remote_expression(remote_expr_id, origin); - if (has_reference) + else { + IndexSpaceExprID remote_expr_id; + derez.deserialize(remote_expr_id); + IndexSpaceExpression *origin; + derez.deserialize(origin); + IndexSpaceExpression *result = + forest->find_or_request_remote_expression(remote_expr_id, origin); #ifdef DEBUG_LEGION IndexSpaceOperation *op = dynamic_cast(result); assert(op != NULL); @@ -6892,13 +6934,22 @@ namespace Legion { #endif LocalReferenceMutator mutator(false/*waiter*/); result->add_base_expression_reference(LIVE_EXPR_REF, &mutator); - op->send_remote_valid_decrement(source, NULL/*mutator*/, - mutator.get_done_event()); + const RtEvent added = mutator.get_done_event(); + // Special case here: if the source was the owner and we didn't + // just send a message to add our reference then we can buffer up + // our reference to be removed when we 
are no longer valid + // Be very careful here! You can only do this if the expression + // was sent from the source or you risk reference count cycles! + if ((source == op->owner_space) && + (!added.exists() || added.has_triggered())) + result->record_remote_owner_valid_reference(); + else + op->send_remote_valid_decrement(source, NULL/*mutator*/, added); if (implicit_reference_tracker == NULL) implicit_reference_tracker = new ImplicitReferenceTracker; implicit_reference_tracker->record_live_expression(result); + return result; } - return result; } //-------------------------------------------------------------------------- @@ -6912,31 +6963,29 @@ namespace Legion { derez.deserialize(is_local); if (is_local) { - derez.deserialize(pending.has_reference); IndexSpaceExpression *result; derez.deserialize(result); - if (pending.has_reference) + if (source != forest->runtime->address_space) { - if (source != forest->runtime->address_space) - { #ifdef DEBUG_LEGION - IndexSpaceOperation *op = - dynamic_cast(result); - assert(op != NULL); + IndexSpaceOperation *op = + dynamic_cast(result); + assert(op != NULL); #else - IndexSpaceOperation *op = static_cast(result); -#endif - // Make this valid and then send the removal of the - // remote did expression - LocalReferenceMutator mutator(false/*waiter*/); - op->add_base_expression_reference(LIVE_EXPR_REF, &mutator); - op->send_remote_valid_decrement(source, NULL/*mutator*/, - mutator.get_done_event()); - } - if (implicit_reference_tracker == NULL) - implicit_reference_tracker = new ImplicitReferenceTracker; - implicit_reference_tracker->record_live_expression(result); + IndexSpaceOperation *op = static_cast(result); +#endif + // Make this valid and then send the removal of the + // remote did expression + LocalReferenceMutator mutator(false/*waiter*/); + op->add_base_expression_reference(LIVE_EXPR_REF, &mutator); + // Always need to send this reference removal back immediately + // in order to avoid reference counting deadlock 
+ op->send_remote_valid_decrement(source, NULL/*mutator*/, + mutator.get_done_event()); } + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(result); return result; } derez.deserialize(pending.is_index_space); @@ -6944,9 +6993,30 @@ namespace Legion { if (pending.is_index_space) { derez.deserialize(pending.handle); - return forest->get_node(pending.handle, &wait_for); + IndexSpaceNode *node = forest->get_node(pending.handle, &wait_for); + if (node == NULL) + { + pending.source = source; + return node; + } + LocalReferenceMutator mutator(false/*waiter*/); + node->add_base_expression_reference(LIVE_EXPR_REF, &mutator); + const RtEvent added = mutator.get_done_event(); + // Special case here: if the source was the owner and we didn't + // just send a message to add our reference then we can buffer up + // our reference to be removed when we are no longer valid + // Be very careful here! You can only do this if the expression + // was sent from the source or you risk reference count cycles! 
+ if ((source == node->owner_space) && + (!added.exists() || added.has_triggered())) + node->record_remote_owner_valid_reference(); + else + node->send_remote_valid_decrement(source, NULL/*mutator*/, added); + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(node); + return node; } - derez.deserialize(pending.has_reference); derez.deserialize(pending.remote_expr_id); IndexSpaceExpression *origin; derez.deserialize(origin); @@ -6958,22 +7028,28 @@ namespace Legion { pending.source = source; return result; } - if (pending.has_reference) - { #ifdef DEBUG_LEGION - IndexSpaceOperation *op = dynamic_cast(result); - assert(op != NULL); + IndexSpaceOperation *op = dynamic_cast(result); + assert(op != NULL); #else - IndexSpaceOperation *op = static_cast(result); -#endif - LocalReferenceMutator mutator(false/*waiter*/); - result->add_base_expression_reference(LIVE_EXPR_REF, &mutator); - op->send_remote_valid_decrement(source, NULL/*mutator*/, - mutator.get_done_event()); - if (implicit_reference_tracker == NULL) - implicit_reference_tracker = new ImplicitReferenceTracker; - implicit_reference_tracker->record_live_expression(result); - } + IndexSpaceOperation *op = static_cast(result); +#endif + LocalReferenceMutator mutator(false/*waiter*/); + result->add_base_expression_reference(LIVE_EXPR_REF, &mutator); + const RtEvent added = mutator.get_done_event(); + // Special case here: if the source was the owner and we didn't + // just send a message to add our reference then we can buffer up + // our reference to be removed when we are no longer valid + // Be very careful here! You can only do this if the expression + // was sent from the source or you risk reference count cycles! 
+ if ((source == op->owner_space) && + (!added.exists() || added.has_triggered())) + result->record_remote_owner_valid_reference(); + else + op->send_remote_valid_decrement(source, NULL/*mutator*/, added); + if (implicit_reference_tracker == NULL) + implicit_reference_tracker = new ImplicitReferenceTracker; + implicit_reference_tracker->record_live_expression(result); return result; } @@ -7064,7 +7140,8 @@ namespace Legion { //-------------------------------------------------------------------------- { if (!is_owner()) - send_remote_valid_decrement(owner_space, mutator); + send_remote_valid_decrement(owner_space, mutator, + RtEvent::NO_RT_EVENT, remote_owner_valid_references.exchange(0) + 1); // If we have a canonical reference that is not ourselves then // we need to remove the nested reference that we are holding on it too if ((canonical != NULL) && (canonical != this) && @@ -7938,7 +8015,8 @@ namespace Legion { tree_valid = false; } else - send_remote_valid_decrement(owner_space, mutator); + send_remote_valid_decrement(owner_space, mutator, + RtEvent::NO_RT_EVENT, remote_owner_valid_references.exchange(0) + 1); // If we have a canonical reference that is not ourselves then // we need to remove the nested reference that we are holding on it too if ((canonical != NULL) && (canonical != this) && @@ -9098,8 +9176,7 @@ namespace Legion { } //-------------------------------------------------------------------------- - void IndexSpaceNode::pack_expression(Serializer &rez, AddressSpaceID target, - bool need_reference) + void IndexSpaceNode::pack_expression(Serializer &rez, AddressSpaceID target) //-------------------------------------------------------------------------- { if (target != context->runtime->address_space) @@ -9107,14 +9184,13 @@ namespace Legion { rez.serialize(false/*local*/); rez.serialize(true/*index space*/); rez.serialize(handle); + add_base_expression_reference(REMOTE_DID_REF); } else { rez.serialize(true/*local*/); - rez.serialize(need_reference); 
rez.serialize(this); - if (need_reference) - add_base_expression_reference(LIVE_EXPR_REF); + add_base_expression_reference(LIVE_EXPR_REF); } } diff --git a/runtime/legion/region_tree.h b/runtime/legion/region_tree.h index af32f82af3..261ae001e7 100644 --- a/runtime/legion/region_tree.h +++ b/runtime/legion/region_tree.h @@ -71,13 +71,12 @@ namespace Legion { public: PendingRemoteExpression(void) : handle(IndexSpace::NO_SPACE), remote_expr_id(0), - source(0), is_index_space(false), has_reference(false) { } + source(0), is_index_space(false) { } public: IndexSpace handle; IndexSpaceExprID remote_expr_id; AddressSpaceID source; bool is_index_space; - bool has_reference; }; /** @@ -1128,8 +1127,7 @@ namespace Legion { virtual void tighten_index_space(void) = 0; virtual bool check_empty(void) = 0; virtual size_t get_volume(void) = 0; - virtual void pack_expression(Serializer &rez, AddressSpaceID target, - bool need_reference = true) = 0; + virtual void pack_expression(Serializer &rez, AddressSpaceID target) = 0; virtual void pack_expression_value(Serializer &rez, AddressSpaceID target) = 0; public: @@ -1251,6 +1249,8 @@ namespace Legion { } inline size_t get_num_dims(void) const { return NT_TemplateHelper::get_dim(type_tag); } + inline void record_remote_owner_valid_reference(void) + { remote_owner_valid_references.fetch_add(1); } public: // Convert this index space expression to the canonical one that // represents all expressions that are all congruent @@ -1359,6 +1359,7 @@ namespace Legion { protected: std::set derived_operations; IndexSpaceExpression *canonical; + std::atomic remote_owner_valid_references; size_t volume; bool has_volume; bool empty, has_empty; @@ -1470,8 +1471,7 @@ namespace Legion { virtual void tighten_index_space(void) = 0; virtual bool check_empty(void) = 0; virtual size_t get_volume(void) = 0; - virtual void pack_expression(Serializer &rez, AddressSpaceID target, - bool need_reference = true) = 0; + virtual void pack_expression(Serializer &rez, 
AddressSpaceID target) = 0; virtual void pack_expression_value(Serializer &rez, AddressSpaceID target) = 0; public: @@ -1531,8 +1531,7 @@ namespace Legion { virtual void tighten_index_space(void); virtual bool check_empty(void); virtual size_t get_volume(void); - virtual void pack_expression(Serializer &rez, AddressSpaceID target, - bool need_reference = true); + virtual void pack_expression(Serializer &rez, AddressSpaceID target); virtual void pack_expression_value(Serializer &rez, AddressSpaceID target) = 0; virtual bool invalidate_operation(void) = 0; @@ -2100,8 +2099,7 @@ namespace Legion { virtual bool set_domain(const Domain &domain, AddressSpaceID space) = 0; virtual void tighten_index_space(void) = 0; virtual bool check_empty(void) = 0; - virtual void pack_expression(Serializer &rez, AddressSpaceID target, - bool need_reference); + virtual void pack_expression(Serializer &rez, AddressSpaceID target); virtual void pack_expression_value(Serializer &rez,AddressSpaceID target); public: #ifdef DEBUG_LEGION diff --git a/runtime/legion/region_tree.inl b/runtime/legion/region_tree.inl index d0c57c4c7f..6325a6e2a7 100644 --- a/runtime/legion/region_tree.inl +++ b/runtime/legion/region_tree.inl @@ -1534,7 +1534,7 @@ namespace Legion { //-------------------------------------------------------------------------- template void IndexSpaceOperationT::pack_expression(Serializer &rez, - AddressSpaceID target, bool need_reference) + AddressSpaceID target) //-------------------------------------------------------------------------- { #ifdef DEBUG_LEGION @@ -1543,33 +1543,26 @@ namespace Legion { if (target == this->local_space) { rez.serialize(true/*local*/); - rez.serialize(need_reference); rez.serialize(this); - // Add a live expression reference to keep this live through the message - if (need_reference) - this->add_base_expression_reference(LIVE_EXPR_REF); + this->add_base_expression_reference(LIVE_EXPR_REF); } else if (target == this->owner_space) { 
rez.serialize(true/*local*/); - rez.serialize(need_reference); rez.serialize(origin_expr); // Add a reference here that we'll remove after we've added a reference // on the target space expression - if (need_reference) - this->add_base_expression_reference(REMOTE_DID_REF); + this->add_base_expression_reference(REMOTE_DID_REF); } else { rez.serialize(false/*local*/); rez.serialize(false/*index space*/); - rez.serialize(need_reference); rez.serialize(expr_id); rez.serialize(origin_expr); // Add a reference here that we'll remove after we've added a reference // on the target space expression - if (need_reference) - this->add_base_expression_reference(REMOTE_DID_REF); + this->add_base_expression_reference(REMOTE_DID_REF); } } From 896fca3d143a41999c15c330f00dae5a3dffb34e Mon Sep 17 00:00:00 2001 From: Sean Treichler Date: Tue, 10 Aug 2021 15:19:05 -0700 Subject: [PATCH 22/36] realm: remove duplicate member variable --- runtime/realm/transfer/ib_memory.h | 1 - 1 file changed, 1 deletion(-) diff --git a/runtime/realm/transfer/ib_memory.h b/runtime/realm/transfer/ib_memory.h index bce04fd059..2c7c392c73 100644 --- a/runtime/realm/transfer/ib_memory.h +++ b/runtime/realm/transfer/ib_memory.h @@ -73,7 +73,6 @@ namespace Realm { Mutex mutex; // protection for resizing vectors std::map free_blocks; char *base; - NetworkSegment *segment; PendingIBRequests *ibreq_head; PendingIBRequests **ibreq_tail; }; From ef6fc594516f38c27b64a78f2532c3e471dc562b Mon Sep 17 00:00:00 2001 From: Sean Treichler Date: Fri, 6 Aug 2021 11:25:08 -0700 Subject: [PATCH 23/36] realm: ask channels to estimate cost of dma hops --- runtime/realm/cuda/cuda_internal.cc | 138 ++++++++------ runtime/realm/cuda/cuda_internal.h | 34 ++-- runtime/realm/hdf5/hdf5_internal.cc | 11 +- runtime/realm/transfer/channel.cc | 249 +++++++++++++++++-------- runtime/realm/transfer/channel.h | 116 +++++++----- runtime/realm/transfer/channel_disk.cc | 18 +- runtime/realm/transfer/lowlevel_dma.cc | 2 + 
runtime/realm/transfer/transfer.h | 6 + 8 files changed, 359 insertions(+), 215 deletions(-) diff --git a/runtime/realm/cuda/cuda_internal.cc b/runtime/realm/cuda/cuda_internal.cc index 484c127abe..9b517c5af5 100644 --- a/runtime/realm/cuda/cuda_internal.cc +++ b/runtime/realm/cuda/cuda_internal.cc @@ -474,38 +474,40 @@ namespace Realm { switch(_kind) { case XFER_GPU_TO_FB: { - unsigned bw = 0; // TODO - unsigned latency = 0; + unsigned bw = 10000; // HACK - estimate at 10 GB/s + unsigned latency = 1000; // HACK - estimate at 1 us + unsigned frag_overhead = 2000; // HACK - estimate at 2 us for(std::set::const_iterator it = src_gpu->pinned_sysmems.begin(); it != src_gpu->pinned_sysmems.end(); ++it) - add_path(*it, fbm, bw, latency, false, false, - XFER_GPU_TO_FB); + add_path(*it, fbm, bw, latency, frag_overhead, XFER_GPU_TO_FB) + .set_max_dim(2); // D->H cudamemcpy3d is unrolled into 2d copies for(std::set::const_iterator it = src_gpu->managed_mems.begin(); it != src_gpu->managed_mems.end(); ++it) - add_path(*it, fbm, bw, latency, false, false, - XFER_GPU_TO_FB); + add_path(*it, fbm, bw, latency, frag_overhead, XFER_GPU_TO_FB) + .set_max_dim(2); // D->H cudamemcpy3d is unrolled into 2d copies break; } case XFER_GPU_FROM_FB: { - unsigned bw = 0; // TODO - unsigned latency = 0; + unsigned bw = 10000; // HACK - estimate at 10 GB/s + unsigned latency = 1000; // HACK - estimate at 1 us + unsigned frag_overhead = 2000; // HACK - estimate at 2 us for(std::set::const_iterator it = src_gpu->pinned_sysmems.begin(); it != src_gpu->pinned_sysmems.end(); ++it) - add_path(fbm, *it, bw, latency, false, false, - XFER_GPU_FROM_FB); + add_path(fbm, *it, bw, latency, frag_overhead, XFER_GPU_FROM_FB) + .set_max_dim(2); // H->D cudamemcpy3d is unrolled into 2d copies for(std::set::const_iterator it = src_gpu->managed_mems.begin(); it != src_gpu->managed_mems.end(); ++it) - add_path(fbm, *it, bw, latency, false, false, - XFER_GPU_FROM_FB); + add_path(fbm, *it, bw, latency, frag_overhead, 
XFER_GPU_FROM_FB) + .set_max_dim(2); // H->D cudamemcpy3d is unrolled into 2d copies break; } @@ -513,10 +515,11 @@ namespace Realm { case XFER_GPU_IN_FB: { // self-path - unsigned bw = 0; // TODO - unsigned latency = 0; - add_path(fbm, fbm, bw, latency, false, false, - XFER_GPU_IN_FB); + unsigned bw = 200000; // HACK - estimate at 200 GB/s + unsigned latency = 250; // HACK - estimate at 250 ns + unsigned frag_overhead = 2000; // HACK - estimate at 2 us + add_path(fbm, fbm, bw, latency, frag_overhead, XFER_GPU_IN_FB) + .set_max_dim(3); break; } @@ -524,13 +527,14 @@ namespace Realm { case XFER_GPU_PEER_FB: { // just do paths to peers - they'll do the other side - unsigned bw = 0; // TODO - unsigned latency = 0; + unsigned bw = 50000; // HACK - estimate at 50 GB/s + unsigned latency = 1000; // HACK - estimate at 1 us + unsigned frag_overhead = 2000; // HACK - estimate at 2 us for(std::set::const_iterator it = src_gpu->peer_fbs.begin(); it != src_gpu->peer_fbs.end(); ++it) - add_path(fbm, *it, bw, latency, false, false, - XFER_GPU_PEER_FB); + add_path(fbm, *it, bw, latency, frag_overhead, XFER_GPU_PEER_FB) + .set_max_dim(3); break; } @@ -1019,11 +1023,12 @@ namespace Realm { { Memory fbm = gpu->fbmem->me; - unsigned bw = 0; // TODO - unsigned latency = 0; + unsigned bw = 300000; // HACK - estimate at 300 GB/s + unsigned latency = 250; // HACK - estimate at 250 ns + unsigned frag_overhead = 2000; // HACK - estimate at 2 us - add_path(Memory::NO_MEMORY, fbm, - bw, latency, false, false, XFER_GPU_IN_FB); + add_path(Memory::NO_MEMORY, fbm, bw, latency, frag_overhead, XFER_GPU_IN_FB) + .set_max_dim(2); xdq.add_to_manager(bgwork); } @@ -1355,19 +1360,20 @@ namespace Realm { { Memory fbm = gpu->fbmem->me; - unsigned bw = 0; // TODO - unsigned latency = 0; + unsigned bw = 100000; // HACK - estimate at 100 GB/s + unsigned latency = 250; // HACK - estimate at 250 ns + unsigned frag_overhead = 2000; // HACK - estimate at 2 us // intra-FB reduction - add_path(fbm, fbm, - bw, 
latency, true /*redops*/, false, XFER_GPU_IN_FB); + add_path(fbm, fbm, bw, latency, frag_overhead, XFER_GPU_IN_FB) + .allow_redops(); // zero-copy to FB (no need for intermediate buffer in FB) for(std::set::const_iterator it = gpu->pinned_sysmems.begin(); it != gpu->pinned_sysmems.end(); ++it) - add_path(*it, fbm, - bw, latency, true /*redops*/, false, XFER_GPU_IN_FB); + add_path(*it, fbm, bw, latency, frag_overhead, XFER_GPU_TO_FB) + .allow_redops(); // unlike normal cuda p2p copies where we want to push from the source, // reductions are always sent to the destination memory's gpu to keep the @@ -1375,8 +1381,8 @@ namespace Realm { for(std::set::const_iterator it = gpu->peer_fbs.begin(); it != gpu->peer_fbs.end(); ++it) - add_path(*it, fbm, - bw, latency, true /*redops*/, false, XFER_GPU_IN_FB); + add_path(*it, fbm, bw, latency, frag_overhead, XFER_GPU_PEER_FB) + .allow_redops(); xdq.add_to_manager(bgwork); } @@ -1397,23 +1403,27 @@ namespace Realm { return true; } - bool GPUreduceChannel::supports_path(Memory src_mem, Memory dst_mem, - CustomSerdezID src_serdez_id, - CustomSerdezID dst_serdez_id, - ReductionOpID redop_id, - XferDesKind *kind_ret /*= 0*/, - unsigned *bw_ret /*= 0*/, - unsigned *lat_ret /*= 0*/) + uint64_t GPUreduceChannel::supports_path(Memory src_mem, Memory dst_mem, + CustomSerdezID src_serdez_id, + CustomSerdezID dst_serdez_id, + ReductionOpID redop_id, + size_t total_bytes, + const std::vector *src_frags, + const std::vector *dst_frags, + XferDesKind *kind_ret /*= 0*/, + unsigned *bw_ret /*= 0*/, + unsigned *lat_ret /*= 0*/) { - // give all the normal supports_path logic a chance to reject it first - if(!Channel::supports_path(src_mem, dst_mem, src_serdez_id, dst_serdez_id, - redop_id, kind_ret, bw_ret, lat_ret)) - return false; + // first check that we have a reduction op (if not, we want the cudamemcpy + // path to pick this up instead) and that it has cuda kernels available + if(!is_gpu_redop(redop_id)) + return 0; - // if everything else 
was ok, check that we have a reduction op (if not, - // we want the cudamemcpy path to pick this up instead) and that it has - // cuda kernels available - return is_gpu_redop(redop_id); + // then delegate to the normal supports_path logic + return Channel::supports_path(src_mem, dst_mem, + src_serdez_id, dst_serdez_id, redop_id, + total_bytes, src_frags, dst_frags, + kind_ret, bw_ret, lat_ret); } RemoteChannelInfo *GPUreduceChannel::construct_remote_info() const @@ -1509,23 +1519,27 @@ namespace Realm { : RemoteChannel(_remote_ptr) {} - bool GPUreduceRemoteChannel::supports_path(Memory src_mem, Memory dst_mem, - CustomSerdezID src_serdez_id, - CustomSerdezID dst_serdez_id, - ReductionOpID redop_id, - XferDesKind *kind_ret /*= 0*/, - unsigned *bw_ret /*= 0*/, - unsigned *lat_ret /*= 0*/) + uint64_t GPUreduceRemoteChannel::supports_path(Memory src_mem, Memory dst_mem, + CustomSerdezID src_serdez_id, + CustomSerdezID dst_serdez_id, + ReductionOpID redop_id, + size_t total_bytes, + const std::vector *src_frags, + const std::vector *dst_frags, + XferDesKind *kind_ret /*= 0*/, + unsigned *bw_ret /*= 0*/, + unsigned *lat_ret /*= 0*/) { - // give all the normal supports_path logic a chance to reject it first - if(!Channel::supports_path(src_mem, dst_mem, src_serdez_id, dst_serdez_id, - redop_id, kind_ret, bw_ret, lat_ret)) - return false; + // check first that we have a reduction op (if not, we want the cudamemcpy + // path to pick this up instead) and that it has cuda kernels available + if(!GPUreduceChannel::is_gpu_redop(redop_id)) + return 0; - // if everything else was ok, check that we have a reduction op (if not, - // we want the cudamemcpy path to pick this up instead) and that it has - // cuda kernels available - return GPUreduceChannel::is_gpu_redop(redop_id); + // then delegate to the normal supports_path logic + return Channel::supports_path(src_mem, dst_mem, + src_serdez_id, dst_serdez_id, redop_id, + total_bytes, src_frags, dst_frags, + kind_ret, bw_ret, 
lat_ret); } diff --git a/runtime/realm/cuda/cuda_internal.h b/runtime/realm/cuda/cuda_internal.h index e12b11ee36..a6e4fbbfca 100644 --- a/runtime/realm/cuda/cuda_internal.h +++ b/runtime/realm/cuda/cuda_internal.h @@ -931,13 +931,16 @@ namespace Realm { // override this because we have to be picky about which reduction ops // we support - virtual bool supports_path(Memory src_mem, Memory dst_mem, - CustomSerdezID src_serdez_id, - CustomSerdezID dst_serdez_id, - ReductionOpID redop_id, - XferDesKind *kind_ret = 0, - unsigned *bw_ret = 0, - unsigned *lat_ret = 0); + virtual uint64_t supports_path(Memory src_mem, Memory dst_mem, + CustomSerdezID src_serdez_id, + CustomSerdezID dst_serdez_id, + ReductionOpID redop_id, + size_t total_bytes, + const std::vector *src_frags, + const std::vector *dst_frags, + XferDesKind *kind_ret = 0, + unsigned *bw_ret = 0, + unsigned *lat_ret = 0); virtual RemoteChannelInfo *construct_remote_info() const; @@ -982,13 +985,16 @@ namespace Realm { GPUreduceRemoteChannel(uintptr_t _remote_ptr); - virtual bool supports_path(Memory src_mem, Memory dst_mem, - CustomSerdezID src_serdez_id, - CustomSerdezID dst_serdez_id, - ReductionOpID redop_id, - XferDesKind *kind_ret = 0, - unsigned *bw_ret = 0, - unsigned *lat_ret = 0); + virtual uint64_t supports_path(Memory src_mem, Memory dst_mem, + CustomSerdezID src_serdez_id, + CustomSerdezID dst_serdez_id, + ReductionOpID redop_id, + size_t total_bytes, + const std::vector *src_frags, + const std::vector *dst_frags, + XferDesKind *kind_ret = 0, + unsigned *bw_ret = 0, + unsigned *lat_ret = 0); }; #ifdef REALM_CUDA_DYNAMIC_LOAD diff --git a/runtime/realm/hdf5/hdf5_internal.cc b/runtime/realm/hdf5/hdf5_internal.cc index a480e86bd6..e54bc6a244 100644 --- a/runtime/realm/hdf5/hdf5_internal.cc +++ b/runtime/realm/hdf5/hdf5_internal.cc @@ -588,23 +588,24 @@ namespace Realm { XFER_NONE /*FIXME*/, "hdf5 channel") { - unsigned bw = 0; // TODO - unsigned latency = 0; + unsigned bw = 10; // HACK - estimate 10 
MB/s + unsigned latency = 10000; // HACK - estimate 10 us + unsigned frag_overhead = 10000; // HACK - estimate 10 us // any combination of SYSTEM/REGDMA/Z_COPY_MEM for(size_t i = 0; i < num_cpu_mem_kinds; i++) { add_path(Memory::HDF_MEM, false, cpu_mem_kinds[i], false, - bw, latency, false, false, XFER_HDF5_READ); + bw, latency, frag_overhead, XFER_HDF5_READ); add_path(cpu_mem_kinds[i], false, Memory::HDF_MEM, false, - bw, latency, false, false, XFER_HDF5_WRITE); + bw, latency, frag_overhead, XFER_HDF5_WRITE); } // also indicate willingness to handle fills to HDF5 (src == NO_MEMORY) add_path(Memory::NO_MEMORY, Memory::HDF_MEM, false, - bw, latency, false, false, XFER_HDF5_WRITE); + bw, latency, frag_overhead, XFER_HDF5_WRITE); } HDF5Channel::~HDF5Channel() {} diff --git a/runtime/realm/transfer/channel.cc b/runtime/realm/transfer/channel.cc index 9ec0e095f1..2ad00fe7fe 100644 --- a/runtime/realm/transfer/channel.cc +++ b/runtime/realm/transfer/channel.cc @@ -4160,6 +4160,44 @@ namespace Realm { return true; } + + //////////////////////////////////////////////////////////////////////// + // + // class Channel::SupportedPath + // + + Channel::SupportedPath& Channel::SupportedPath::set_max_dim(int src_and_dst_dim) + { + max_src_dim = max_dst_dim = src_and_dst_dim; + return *this; + } + + Channel::SupportedPath& Channel::SupportedPath::set_max_dim(int src_dim, + int dst_dim) + { + max_src_dim = src_dim; + max_dst_dim = dst_dim; + return *this; + } + + Channel::SupportedPath& Channel::SupportedPath::allow_redops() + { + redops_allowed = true; + return *this; + } + + Channel::SupportedPath& Channel::SupportedPath::allow_serdez() + { + serdez_allowed = true; + return *this; + } + + + //////////////////////////////////////////////////////////////////////// + // + // class Channel + // + std::ostream& operator<<(std::ostream& os, const Channel::SupportedPath& p) { switch(p.src_type) { @@ -4223,13 +4261,16 @@ namespace Realm { return paths; } - bool 
Channel::supports_path(Memory src_mem, Memory dst_mem, - CustomSerdezID src_serdez_id, - CustomSerdezID dst_serdez_id, - ReductionOpID redop_id, - XferDesKind *kind_ret /*= 0*/, - unsigned *bw_ret /*= 0*/, - unsigned *lat_ret /*= 0*/) + uint64_t Channel::supports_path(Memory src_mem, Memory dst_mem, + CustomSerdezID src_serdez_id, + CustomSerdezID dst_serdez_id, + ReductionOpID redop_id, + size_t total_bytes, + const std::vector *src_frags, + const std::vector *dst_frags, + XferDesKind *kind_ret /*= 0*/, + unsigned *bw_ret /*= 0*/, + unsigned *lat_ret /*= 0*/) { for(std::vector::const_iterator it = paths.begin(); it != paths.end(); @@ -4342,16 +4383,29 @@ namespace Realm { if(kind_ret) *kind_ret = it->xd_kind; if(bw_ret) *bw_ret = it->bandwidth; if(lat_ret) *lat_ret = it->latency; - return true; + + // estimate transfer time + uint64_t xfer_time = uint64_t(total_bytes) * 1000 / it->bandwidth; + uint64_t frags = 1; + if(src_frags) + frags = std::max(frags, (*src_frags)[std::min(src_frags->size()-1, + it->max_src_dim)]); + if(dst_frags) + frags = std::max(frags, (*dst_frags)[std::min(dst_frags->size()-1, + it->max_dst_dim)]); + xfer_time += frags * it->frag_overhead; + + // make sure returned value is strictly positive + return std::max(xfer_time, 1); } - return false; + return 0; } - void Channel::add_path(Memory src_mem, Memory dst_mem, - unsigned bandwidth, unsigned latency, - bool redops_allowed, bool serdez_allowed, - XferDesKind xd_kind) + Channel::SupportedPath& Channel::add_path(Memory src_mem, Memory dst_mem, + unsigned bandwidth, unsigned latency, + unsigned frag_overhead, + XferDesKind xd_kind) { size_t idx = paths.size(); paths.resize(idx + 1); @@ -4362,15 +4416,18 @@ namespace Realm { p.dst_mem = dst_mem; p.bandwidth = bandwidth; p.latency = latency; - p.redops_allowed = redops_allowed; - p.serdez_allowed = serdez_allowed; + p.frag_overhead = frag_overhead; + p.max_src_dim = p.max_dst_dim = 1; // default + p.redops_allowed = false; // default + 
p.serdez_allowed = false; // default p.xd_kind = xd_kind; + return p; } - void Channel::add_path(Memory src_mem, Memory::Kind dst_kind, bool dst_global, - unsigned bandwidth, unsigned latency, - bool redops_allowed, bool serdez_allowed, - XferDesKind xd_kind) + Channel::SupportedPath& Channel::add_path(Memory src_mem, Memory::Kind dst_kind, bool dst_global, + unsigned bandwidth, unsigned latency, + unsigned frag_overhead, + XferDesKind xd_kind) { size_t idx = paths.size(); paths.resize(idx + 1); @@ -4382,16 +4439,19 @@ namespace Realm { p.dst_kind = dst_kind; p.bandwidth = bandwidth; p.latency = latency; - p.redops_allowed = redops_allowed; - p.serdez_allowed = serdez_allowed; + p.frag_overhead = frag_overhead; + p.max_src_dim = p.max_dst_dim = 1; // default + p.redops_allowed = false; // default + p.serdez_allowed = false; // default p.xd_kind = xd_kind; + return p; } - void Channel::add_path(Memory::Kind src_kind, bool src_global, - Memory::Kind dst_kind, bool dst_global, - unsigned bandwidth, unsigned latency, - bool redops_allowed, bool serdez_allowed, - XferDesKind xd_kind) + Channel::SupportedPath& Channel::add_path(Memory::Kind src_kind, bool src_global, + Memory::Kind dst_kind, bool dst_global, + unsigned bandwidth, unsigned latency, + unsigned frag_overhead, + XferDesKind xd_kind) { size_t idx = paths.size(); paths.resize(idx + 1); @@ -4404,16 +4464,19 @@ namespace Realm { p.dst_kind = dst_kind; p.bandwidth = bandwidth; p.latency = latency; - p.redops_allowed = redops_allowed; - p.serdez_allowed = serdez_allowed; + p.frag_overhead = frag_overhead; + p.max_src_dim = p.max_dst_dim = 1; // default + p.redops_allowed = false; // default + p.serdez_allowed = false; // default p.xd_kind = xd_kind; + return p; } // TODO: allow rdma path to limit by kind? 
- void Channel::add_path(bool local_loopback, - unsigned bandwidth, unsigned latency, - bool redops_allowed, bool serdez_allowed, - XferDesKind xd_kind) + Channel::SupportedPath& Channel::add_path(bool local_loopback, + unsigned bandwidth, unsigned latency, + unsigned frag_overhead, + XferDesKind xd_kind) { size_t idx = paths.size(); paths.resize(idx + 1); @@ -4423,9 +4486,12 @@ namespace Realm { SupportedPath::REMOTE_RDMA); p.bandwidth = bandwidth; p.latency = latency; - p.redops_allowed = redops_allowed; - p.serdez_allowed = serdez_allowed; + p.frag_overhead = frag_overhead; + p.max_src_dim = p.max_dst_dim = 1; // default + p.redops_allowed = false; // default + p.serdez_allowed = false; // default p.xd_kind = xd_kind; + return p; } long Channel::progress_xd(XferDes *xd, long max_nr) @@ -4669,23 +4735,26 @@ namespace Realm { return 0; } - bool RemoteChannel::supports_path(Memory src_mem, Memory dst_mem, - CustomSerdezID src_serdez_id, - CustomSerdezID dst_serdez_id, - ReductionOpID redop_id, - XferDesKind *kind_ret /*= 0*/, - unsigned *bw_ret /*= 0*/, - unsigned *lat_ret /*= 0*/) + uint64_t RemoteChannel::supports_path(Memory src_mem, Memory dst_mem, + CustomSerdezID src_serdez_id, + CustomSerdezID dst_serdez_id, + ReductionOpID redop_id, + size_t total_bytes, + const std::vector *src_frags, + const std::vector *dst_frags, + XferDesKind *kind_ret /*= 0*/, + unsigned *bw_ret /*= 0*/, + unsigned *lat_ret /*= 0*/) { // simultaneous serialization/deserialization not // allowed anywhere right now if((src_serdez_id != 0) && (dst_serdez_id != 0)) - return false; + return 0; // fall through to normal checks return Channel::supports_path(src_mem, dst_mem, - src_serdez_id, dst_serdez_id, - redop_id, + src_serdez_id, dst_serdez_id, redop_id, + total_bytes, src_frags, dst_frags, kind_ret, bw_ret, lat_ret); } @@ -4708,14 +4777,17 @@ namespace Realm { "memcpy channel") { //cbs = (MemcpyRequest**) calloc(max_nr, sizeof(MemcpyRequest*)); - unsigned bw = 0; // TODO - unsigned 
latency = 0; + unsigned bw = 5000; // HACK - estimate at 5 GB/s + unsigned latency = 100; // HACK - estimate at 100ns + unsigned frag_overhead = 100; // HACK - estimate at 100ns // any combination of SYSTEM/REGDMA/Z_COPY/SOCKET_MEM for(size_t i = 0; i < num_cpu_mem_kinds; i++) for(size_t j = 0; j < num_cpu_mem_kinds; j++) add_path(cpu_mem_kinds[i], false, cpu_mem_kinds[j], false, - bw, latency, false, true, XFER_MEM_CPY); + bw, latency, frag_overhead, XFER_MEM_CPY) + .set_max_dim(3) + .allow_serdez(); xdq.add_to_manager(bgwork); } @@ -4725,23 +4797,26 @@ namespace Realm { //free(cbs); } - bool MemcpyChannel::supports_path(Memory src_mem, Memory dst_mem, - CustomSerdezID src_serdez_id, - CustomSerdezID dst_serdez_id, - ReductionOpID redop_id, - XferDesKind *kind_ret /*= 0*/, - unsigned *bw_ret /*= 0*/, - unsigned *lat_ret /*= 0*/) + uint64_t MemcpyChannel::supports_path(Memory src_mem, Memory dst_mem, + CustomSerdezID src_serdez_id, + CustomSerdezID dst_serdez_id, + ReductionOpID redop_id, + size_t total_bytes, + const std::vector *src_frags, + const std::vector *dst_frags, + XferDesKind *kind_ret /*= 0*/, + unsigned *bw_ret /*= 0*/, + unsigned *lat_ret /*= 0*/) { // simultaneous serialization/deserialization not // allowed anywhere right now if((src_serdez_id != 0) && (dst_serdez_id != 0)) - return false; + return 0; // fall through to normal checks return Channel::supports_path(src_mem, dst_mem, - src_serdez_id, dst_serdez_id, - redop_id, + src_serdez_id, dst_serdez_id, redop_id, + total_bytes, src_frags, dst_frags, kind_ret, bw_ret, lat_ret); } @@ -5230,13 +5305,15 @@ namespace Realm { XFER_MEM_FILL, "memfill channel") { - unsigned bw = 0; // TODO - unsigned latency = 0; + unsigned bw = 10000; // HACK - estimate at 10 GB/s + unsigned latency = 100; // HACK - estimate at 100ns + unsigned frag_overhead = 100; // HACK - estimate at 100ns // any of SYSTEM/REGDMA/Z_COPY/SOCKET_MEM is a valid destination for(size_t i = 0; i < num_cpu_mem_kinds; i++) 
add_path(Memory::NO_MEMORY, cpu_mem_kinds[i], false, - bw, latency, false, false, XFER_MEM_FILL); + bw, latency, frag_overhead, XFER_MEM_FILL) + .set_max_dim(3); xdq.add_to_manager(bgwork); } @@ -5279,33 +5356,41 @@ namespace Realm { XFER_MEM_CPY, "memreduce channel") { - unsigned bw = 0; // TODO - unsigned latency = 0; + unsigned bw = 1000; // HACK - estimate at 1 GB/s + unsigned latency = 100; // HACK - estimate at 100ns + unsigned frag_overhead = 100; // HACK - estimate at 100ns // any combination of SYSTEM/REGDMA/Z_COPY/SOCKET_MEM for(size_t i = 0; i < num_cpu_mem_kinds; i++) for(size_t j = 0; j < num_cpu_mem_kinds; j++) add_path(cpu_mem_kinds[i], false, cpu_mem_kinds[j], false, - bw, latency, true, false, XFER_MEM_CPY); + bw, latency, frag_overhead, XFER_MEM_CPY) + .set_max_dim(3) + .allow_redops(); xdq.add_to_manager(bgwork); } - bool MemreduceChannel::supports_path(Memory src_mem, Memory dst_mem, - CustomSerdezID src_serdez_id, - CustomSerdezID dst_serdez_id, - ReductionOpID redop_id, - XferDesKind *kind_ret /*= 0*/, - unsigned *bw_ret /*= 0*/, - unsigned *lat_ret /*= 0*/) + uint64_t MemreduceChannel::supports_path(Memory src_mem, Memory dst_mem, + CustomSerdezID src_serdez_id, + CustomSerdezID dst_serdez_id, + ReductionOpID redop_id, + size_t total_bytes, + const std::vector *src_frags, + const std::vector *dst_frags, + XferDesKind *kind_ret /*= 0*/, + unsigned *bw_ret /*= 0*/, + unsigned *lat_ret /*= 0*/) { // if it's not a reduction, we don't want it if(redop_id == 0) - return false; + return 0; // otherwise consult the normal supports_path logic - return Channel::supports_path(src_mem, dst_mem, src_serdez_id, dst_serdez_id, - redop_id, kind_ret, bw_ret, lat_ret); + return Channel::supports_path(src_mem, dst_mem, + src_serdez_id, dst_serdez_id, redop_id, + total_bytes, src_frags, dst_frags, + kind_ret, bw_ret, lat_ret); } XferDes *MemreduceChannel::create_xfer_des(uintptr_t dma_op, @@ -5343,18 +5428,19 @@ namespace Realm { _kind, stringbuilder() << 
"gasnet channel (kind= " << _kind << ")") { - unsigned bw = 0; // TODO - unsigned latency = 0; + unsigned bw = 1000; // HACK - estimate at 1 GB/s + unsigned latency = 5000; // HACK - estimate at 5 us + unsigned frag_overhead = 1000; // HACK - estimate at 1 us // any combination of SYSTEM/REGDMA/Z_COPY/SOCKET_MEM for(size_t i = 0; i < num_cpu_mem_kinds; i++) if(_kind == XFER_GASNET_READ) add_path(Memory::GLOBAL_MEM, true, cpu_mem_kinds[i], false, - bw, latency, false, false, XFER_GASNET_READ); + bw, latency, frag_overhead, XFER_GASNET_READ); else add_path(cpu_mem_kinds[i], false, Memory::GLOBAL_MEM, true, - bw, latency, false, false, XFER_GASNET_WRITE); + bw, latency, frag_overhead, XFER_GASNET_WRITE); } GASNetChannel::~GASNetChannel() @@ -5413,17 +5499,18 @@ namespace Realm { XFER_REMOTE_WRITE, "remote write channel") { - unsigned bw = 0; // TODO - unsigned latency = 0; + unsigned bw = 5000; // HACK - estimate at 5 GB/s + unsigned latency = 2000; // HACK - estimate at 2 us + unsigned frag_overhead = 1000; // HACK - estimate at 1 us // any combination of SYSTEM/REGDMA/Z_COPY/SOCKET_MEM // for(size_t i = 0; i < num_cpu_mem_kinds; i++) // add_path(cpu_mem_kinds[i], false, // Memory::REGDMA_MEM, true, // bw, latency, false, false, XFER_REMOTE_WRITE); add_path(false /*!local_loopback*/, - bw, latency, - false /*!redops*/, false /*!serdez*/, + bw, latency, frag_overhead, XFER_REMOTE_WRITE); + // TODO: permit 2d sources? } RemoteWriteChannel::~RemoteWriteChannel() {} diff --git a/runtime/realm/transfer/channel.h b/runtime/realm/transfer/channel.h index fc2bcb08a8..b992ddfefc 100644 --- a/runtime/realm/transfer/channel.h +++ b/runtime/realm/transfer/channel.h @@ -781,19 +781,33 @@ namespace Realm { XferDesKind xd_kind; unsigned bandwidth; // units = MB/s = B/us unsigned latency; // units = ns + unsigned frag_overhead; // units = ns + unsigned char max_src_dim, max_dst_dim; bool redops_allowed; // TODO: list of redops? bool serdez_allowed; // TODO: list of serdez ops? 
+ + // mutators to modify less-common fields + SupportedPath& set_max_dim(int src_and_dst_dim); + SupportedPath& set_max_dim(int src_dim, int dst_dim); + SupportedPath& allow_redops(); + SupportedPath& allow_serdez(); }; const std::vector& get_paths(void) const; - virtual bool supports_path(Memory src_mem, Memory dst_mem, - CustomSerdezID src_serdez_id, - CustomSerdezID dst_serdez_id, - ReductionOpID redop_id, - XferDesKind *kind_ret = 0, - unsigned *bw_ret = 0, - unsigned *lat_ret = 0); + // returns 0 if the path is not supported, or a strictly-positive + // estimate of the time required (in nanoseconds) to transfer data + // along a supported path + virtual uint64_t supports_path(Memory src_mem, Memory dst_mem, + CustomSerdezID src_serdez_id, + CustomSerdezID dst_serdez_id, + ReductionOpID redop_id, + size_t total_bytes, + const std::vector *src_frags, + const std::vector *dst_frags, + XferDesKind *kind_ret = 0, + unsigned *bw_ret = 0, + unsigned *lat_ret = 0); virtual RemoteChannelInfo *construct_remote_info() const; @@ -803,24 +817,27 @@ namespace Realm { virtual void wakeup_xd(XferDes *xd) = 0; protected: - void add_path(Memory src_mem, Memory dst_mem, - unsigned bandwidth, unsigned latency, - bool redops_allowed, bool serdez_allowed, - XferDesKind xd_kind); - void add_path(Memory src_mem, Memory::Kind dst_kind, bool dst_global, - unsigned bandwidth, unsigned latency, - bool redops_allowed, bool serdez_allowed, - XferDesKind xd_kind); - void add_path(Memory::Kind src_kind, bool src_global, - Memory::Kind dst_kind, bool dst_global, - unsigned bandwidth, unsigned latency, - bool redops_allowed, bool serdez_allowed, - XferDesKind xd_kind); + // returns the added path for further modification, but reference is + // only valid until the next call to 'add_path' + SupportedPath& add_path(Memory src_mem, Memory dst_mem, + unsigned bandwidth, unsigned latency, + unsigned frag_overhead, + XferDesKind xd_kind); + SupportedPath& add_path(Memory src_mem, + Memory::Kind 
dst_kind, bool dst_global, + unsigned bandwidth, unsigned latency, + unsigned frag_overhead, + XferDesKind xd_kind); + SupportedPath& add_path(Memory::Kind src_kind, bool src_global, + Memory::Kind dst_kind, bool dst_global, + unsigned bandwidth, unsigned latency, + unsigned frag_overhead, + XferDesKind xd_kind); // TODO: allow rdma path to limit by kind? - void add_path(bool local_loopback, - unsigned bandwidth, unsigned latency, - bool redops_allowed, bool serdez_allowed, - XferDesKind xd_kind); + SupportedPath& add_path(bool local_loopback, + unsigned bandwidth, unsigned latency, + unsigned frag_overhead, + XferDesKind xd_kind); std::vector paths; }; @@ -916,13 +933,16 @@ namespace Realm { */ virtual long available(); - virtual bool supports_path(Memory src_mem, Memory dst_mem, - CustomSerdezID src_serdez_id, - CustomSerdezID dst_serdez_id, - ReductionOpID redop_id, - XferDesKind *kind_ret = 0, - unsigned *bw_ret = 0, - unsigned *lat_ret = 0); + virtual uint64_t supports_path(Memory src_mem, Memory dst_mem, + CustomSerdezID src_serdez_id, + CustomSerdezID dst_serdez_id, + ReductionOpID redop_id, + size_t total_bytes, + const std::vector *src_frags, + const std::vector *dst_frags, + XferDesKind *kind_ret = 0, + unsigned *bw_ret = 0, + unsigned *lat_ret = 0); virtual void enqueue_ready_xd(XferDes *xd) { assert(0); } virtual void wakeup_xd(XferDes *xd) { assert(0); } @@ -980,13 +1000,16 @@ namespace Realm { ~MemcpyChannel(); - virtual bool supports_path(Memory src_mem, Memory dst_mem, - CustomSerdezID src_serdez_id, - CustomSerdezID dst_serdez_id, - ReductionOpID redop_id, - XferDesKind *kind_ret = 0, - unsigned *bw_ret = 0, - unsigned *lat_ret = 0); + virtual uint64_t supports_path(Memory src_mem, Memory dst_mem, + CustomSerdezID src_serdez_id, + CustomSerdezID dst_serdez_id, + ReductionOpID redop_id, + size_t total_bytes, + const std::vector *src_frags, + const std::vector *dst_frags, + XferDesKind *kind_ret = 0, + unsigned *bw_ret = 0, + unsigned *lat_ret = 0); 
virtual XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node, @@ -1033,13 +1056,16 @@ namespace Realm { static const bool is_ordered = false; // override because we don't want to claim non-reduction copies - virtual bool supports_path(Memory src_mem, Memory dst_mem, - CustomSerdezID src_serdez_id, - CustomSerdezID dst_serdez_id, - ReductionOpID redop_id, - XferDesKind *kind_ret = 0, - unsigned *bw_ret = 0, - unsigned *lat_ret = 0); + virtual uint64_t supports_path(Memory src_mem, Memory dst_mem, + CustomSerdezID src_serdez_id, + CustomSerdezID dst_serdez_id, + ReductionOpID redop_id, + size_t total_bytes, + const std::vector *src_frags, + const std::vector *dst_frags, + XferDesKind *kind_ret = 0, + unsigned *bw_ret = 0, + unsigned *lat_ret = 0); virtual XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node, diff --git a/runtime/realm/transfer/channel_disk.cc b/runtime/realm/transfer/channel_disk.cc index 05171274f3..acfa2b3594 100644 --- a/runtime/realm/transfer/channel_disk.cc +++ b/runtime/realm/transfer/channel_disk.cc @@ -254,17 +254,18 @@ namespace Realm { XFER_NONE /*FIXME*/, "file channel") { - unsigned bw = 0; // TODO - unsigned latency = 0; + unsigned bw = 10; // HACK - estimate 10 MB/s + unsigned latency = 10000; // HACK - estimate 10 us + unsigned frag_overhead = 10000; // HACK - estimate 10 us // any combination of SYSTEM/REGDMA/Z_COPY_MEM for(size_t i = 0; i < num_cpu_mem_kinds; i++) { add_path(Memory::FILE_MEM, false, cpu_mem_kinds[i], false, - bw, latency, false, false, XFER_FILE_READ); + bw, latency, frag_overhead, XFER_FILE_READ); add_path(cpu_mem_kinds[i], false, Memory::FILE_MEM, false, - bw, latency, false, false, XFER_FILE_WRITE); + bw, latency, frag_overhead, XFER_FILE_WRITE); } } @@ -317,17 +318,18 @@ namespace Realm { XFER_NONE /*FIXME*/, "disk channel") { - unsigned bw = 0; // TODO - unsigned latency = 0; + unsigned bw = 10; // HACK - estimate 10 MB/s + unsigned latency = 10000; // HACK - estimate 10 us + unsigned 
frag_overhead = 10000; // HACK - estimate 10 us // any combination of SYSTEM/REGDMA/Z_COPY_MEM for(size_t i = 0; i < num_cpu_mem_kinds; i++) { add_path(Memory::DISK_MEM, false, cpu_mem_kinds[i], false, - bw, latency, false, false, XFER_DISK_READ); + bw, latency, frag_overhead, XFER_DISK_READ); add_path(cpu_mem_kinds[i], false, Memory::DISK_MEM, false, - bw, latency, false, false, XFER_DISK_WRITE); + bw, latency, frag_overhead, XFER_DISK_WRITE); } } diff --git a/runtime/realm/transfer/lowlevel_dma.cc b/runtime/realm/transfer/lowlevel_dma.cc index 9640b779af..6036269495 100644 --- a/runtime/realm/transfer/lowlevel_dma.cc +++ b/runtime/realm/transfer/lowlevel_dma.cc @@ -559,6 +559,7 @@ namespace Realm { if((*it)->supports_path(src_mem, dst_mem, src_serdez_id, dst_serdez_id, redop_id, + 0, 0, 0, // FIXME &kind, &bw, &latency)) { channel = *it; break; @@ -576,6 +577,7 @@ namespace Realm { if((*it)->supports_path(src_mem, dst_mem, src_serdez_id, dst_serdez_id, redop_id, + 0, 0, 0, // FIXME &kind, &bw, &latency)) { channel = *it; break; diff --git a/runtime/realm/transfer/transfer.h b/runtime/realm/transfer/transfer.h index 19d332a341..5781054263 100644 --- a/runtime/realm/transfer/transfer.h +++ b/runtime/realm/transfer/transfer.h @@ -139,6 +139,12 @@ namespace Realm { bool force_fortran_order, size_t max_stride) const = 0; + virtual void count_fragments(RegionInstance inst, + const std::vector& dim_order, + const std::vector& fields, + const std::vector& fld_sizes, + std::vector& fragments) const = 0; + virtual TransferIterator *create_iterator(RegionInstance inst, const std::vector& dim_order, const std::vector& fields, From 3422a044e0a49b7fa0997c127729692323c04888 Mon Sep 17 00:00:00 2001 From: Sean Treichler Date: Tue, 14 Dec 2021 21:30:26 -0800 Subject: [PATCH 24/36] realm: slightly less hacky way of choosing ib memory for address splitters --- runtime/realm/transfer/transfer.cc | 35 +++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 8 deletions(-) 
diff --git a/runtime/realm/transfer/transfer.cc b/runtime/realm/transfer/transfer.cc index d1e0de757d..627198ddcd 100644 --- a/runtime/realm/transfer/transfer.cc +++ b/runtime/realm/transfer/transfer.cc @@ -2276,6 +2276,25 @@ namespace Realm { return TransferGraph::XDTemplate::mk_edge(ib_base + hops - 1); } + // address splitters need to be able to read addresses from sysmem + // TODO: query this from channel in order to support heterogeneity? + static Memory find_sysmem_ib_memory(NodeID node) + { + Node& n = get_runtime()->nodes[node]; + for(std::vector::const_iterator it = n.ib_memories.begin(); + it != n.ib_memories.end(); + ++it) + if(((*it)->lowlevel_kind == Memory::SYSTEM_MEM) || + ((*it)->lowlevel_kind == Memory::REGDMA_MEM) || + ((*it)->lowlevel_kind == Memory::SOCKET_MEM) || + ((*it)->lowlevel_kind == Memory::Z_COPY_MEM)) + return (*it)->me; + + log_dma.fatal() << "no sysmem ib memory on node " << node; + abort(); + return Memory::NO_MEMORY; + } + void IndirectionInfoBase::generate_gather_paths(Memory dst_mem, TransferGraph::XDTemplate::IO dst_edge, unsigned indirect_idx, @@ -2328,7 +2347,7 @@ namespace Realm { if((spaces_size == 1) && !oor_possible) { size_t pathlen = path_infos[0].xd_channels.size(); // HACK! - Memory local_ib_mem = ID::make_ib_memory(path_infos[0].xd_channels[0]->node, 0).convert(); + Memory local_ib_mem = find_sysmem_ib_memory(path_infos[0].xd_channels[0]->node); // do we have to do anything to get the addresses into a cpu-readable // memory on that node? MemPathInfo addr_path; @@ -2383,7 +2402,7 @@ namespace Realm { // the data to where a cpu can look at it NodeID addr_node = ID(inst).instance_owner_node(); // HACK! - Memory addr_ib_mem = ID::make_ib_memory(addr_node, 0).convert(); + Memory addr_ib_mem = find_sysmem_ib_memory(addr_node); MemPathInfo addr_path; bool ok = find_shortest_path(inst.get_location(), addr_ib_mem, @@ -2429,7 +2448,7 @@ namespace Realm { // data instances live for(size_t i = 0; i < spaces_size; i++) { // HACK! 
- Memory src_ib_mem = ID::make_ib_memory(ID(insts[i]).instance_owner_node(), 0).convert(); + Memory src_ib_mem = find_sysmem_ib_memory(ID(insts[i]).instance_owner_node()); if(src_ib_mem != addr_ib_mem) { MemPathInfo path; bool ok = find_shortest_path(addr_ib_mem, src_ib_mem, @@ -2446,7 +2465,7 @@ namespace Realm { // control information has to get to the merge at the end // HACK! NodeID dst_node = ID(dst_mem).memory_owner_node(); - Memory dst_ib_mem = ID::make_ib_memory(dst_node, 0).convert(); + Memory dst_ib_mem = find_sysmem_ib_memory(dst_node); if(dst_ib_mem != addr_ib_mem) { MemPathInfo path; bool ok = find_shortest_path(addr_ib_mem, dst_ib_mem, @@ -2626,7 +2645,7 @@ namespace Realm { if((spaces_size == 1) && !oor_possible) { size_t pathlen = path_infos[0].xd_channels.size(); // HACK! - Memory local_ib_mem = ID::make_ib_memory(path_infos[0].xd_channels[pathlen - 1]->node, 0).convert(); + Memory local_ib_mem = find_sysmem_ib_memory(path_infos[0].xd_channels[pathlen - 1]->node); // do we have to do anything to get the addresses into a cpu-readable // memory on that node? MemPathInfo addr_path; @@ -2683,7 +2702,7 @@ namespace Realm { // the data to where a cpu can look at it NodeID addr_node = ID(inst).instance_owner_node(); // HACK! - Memory addr_ib_mem = ID::make_ib_memory(addr_node, 0).convert(); + Memory addr_ib_mem = find_sysmem_ib_memory(addr_node); MemPathInfo addr_path; bool ok = find_shortest_path(inst.get_location(), addr_ib_mem, @@ -2730,7 +2749,7 @@ namespace Realm { for(size_t i = 0; i < spaces_size; i++) { // HACK! NodeID dst_node = path_infos[path_idx[i]].xd_channels[path_infos[path_idx[i]].xd_channels.size() - 1]->node; - Memory dst_ib_mem = ID::make_ib_memory(dst_node, 0).convert(); + Memory dst_ib_mem = find_sysmem_ib_memory(dst_node); if(dst_ib_mem != addr_ib_mem) { MemPathInfo path; bool ok = find_shortest_path(addr_ib_mem, dst_ib_mem, @@ -2746,7 +2765,7 @@ namespace Realm { // control information has to get to the split at the start // HACK! 
- Memory src_ib_mem = ID::make_ib_memory(ID(src_mem).memory_owner_node(), 0).convert(); + Memory src_ib_mem = find_sysmem_ib_memory(ID(src_mem).memory_owner_node()); if(src_ib_mem != addr_ib_mem) { MemPathInfo path; bool ok = find_shortest_path(addr_ib_mem, src_ib_mem, From 7d12a8edccab7fa8eae8c1f7faeb9792fbe17614 Mon Sep 17 00:00:00 2001 From: Sean Treichler Date: Fri, 6 Aug 2021 11:29:40 -0700 Subject: [PATCH 25/36] realm: try to find fastest rather than shortest dma path --- runtime/realm/transfer/channel.cc | 4 +- runtime/realm/transfer/transfer.cc | 474 ++++++++++++++++++++++++++++- 2 files changed, 470 insertions(+), 8 deletions(-) diff --git a/runtime/realm/transfer/channel.cc b/runtime/realm/transfer/channel.cc index 2ad00fe7fe..0e973c8519 100644 --- a/runtime/realm/transfer/channel.cc +++ b/runtime/realm/transfer/channel.cc @@ -4386,14 +4386,14 @@ namespace Realm { // estimate transfer time uint64_t xfer_time = uint64_t(total_bytes) * 1000 / it->bandwidth; - uint64_t frags = 1; + size_t frags = 1; if(src_frags) frags = std::max(frags, (*src_frags)[std::min(src_frags->size()-1, it->max_src_dim)]); if(dst_frags) frags = std::max(frags, (*dst_frags)[std::min(dst_frags->size()-1, it->max_dst_dim)]); - xfer_time += frags * it->frag_overhead; + xfer_time += uint64_t(frags) * it->frag_overhead; // make sure returned value is strictly positive return std::max(xfer_time, 1); diff --git a/runtime/realm/transfer/transfer.cc b/runtime/realm/transfer/transfer.cc index 627198ddcd..abd99e586c 100644 --- a/runtime/realm/transfer/transfer.cc +++ b/runtime/realm/transfer/transfer.cc @@ -32,6 +32,7 @@ namespace Realm { extern Logger log_dma; extern Logger log_ib_alloc; Logger log_xplan("xplan"); + Logger log_xpath("xpath"); //////////////////////////////////////////////////////////////////////// // @@ -171,6 +172,76 @@ namespace Realm { } } + // finds the largest subrectangle of 'domain' that starts with 'start', + // lies entirely within 'restriction', and is consistent 
with an iteration + // order (over the original 'domain') of 'dim_order' + // the subrectangle is returned in 'subrect', the start of the next subrect + // is in 'next_start', and the return value indicates whether the 'domain' + // has been fully covered + template + static bool next_subrect(const Rect& domain, const Point& start, + const Rect& restriction, const int *dim_order, + Rect& subrect, Point& next_start) + { + // special case for when we can do the whole domain in one subrect + if((start == domain.lo) && restriction.contains(domain)) { + subrect = domain; + return true; + } + +#ifdef DEBUG_REALM + // starting point better be inside the restriction + assert(restriction.contains(start)); +#endif + subrect.lo = start; + + for(int di = 0; di < N; di++) { + int d = dim_order[di]; + + // can we go to the end of the domain in this dimension? + if(domain.hi[d] <= restriction.hi[d]) { + // we can go to the end of the domain in this dimension ... + subrect.hi[d] = domain.hi[d]; + next_start[d] = domain.lo[d]; + + if(start[d] == domain.lo[d]) { + // ... and since we started at the start, we can continue to next dim + continue; + } else { + // ... 
but we have to stop since this wasn't a full span + if(++di < N) { + d = dim_order[di]; + subrect.hi[d] = start[d]; + next_start[d] = start[d] + 1; + + while(++di < N) { + d = dim_order[di]; + subrect.hi[d] = start[d]; + next_start[d] = start[d]; + } + + return false; // still more to do + } + } + } else { + // we didn't get to the end, so we'll have to pick up where we left off + subrect.hi[d] = restriction.hi[d]; + next_start[d] = restriction.hi[d] + 1; + + while(++di < N) { + d = dim_order[di]; + subrect.hi[d] = start[d]; + next_start[d] = start[d]; + } + + return false; // still more to do + } + } + + // if we got through all dimensions, we're done with this domain + return true; + } + template size_t TransferIteratorBase::step(size_t max_bytes, AddressInfo& info, unsigned flags, @@ -1359,6 +1430,12 @@ namespace Realm { bool force_fortran_order, size_t max_stride) const; + virtual void count_fragments(RegionInstance inst, + const std::vector& dim_order, + const std::vector& fields, + const std::vector& fld_sizes, + std::vector& fragments) const; + virtual TransferIterator *create_iterator(RegionInstance inst, const std::vector& dim_order, const std::vector& fields, @@ -1577,6 +1654,136 @@ namespace Realm { } } + template + static void add_fragments_for_rect(const Rect& rect, + size_t field_size, + size_t field_count, + const Point& strides, + const std::vector& dim_order, + std::vector& fragments) + { + int collapsed[N+1]; + int breaks = 0; + collapsed[0] = 1; + size_t exp_stride = field_size; + for(int di = 0; di < N; di++) { + int d = dim_order[di]; + // skip trivial dimensions + if(rect.lo[d] == rect.hi[d]) continue; + + size_t extent = size_t(rect.hi[d]) - size_t(rect.lo[d]) + 1; + + if(exp_stride == strides[d]) { + // stride match? 
collapse more + collapsed[breaks] *= extent; + exp_stride *= extent; + } else { + // stride mismatch - break here + breaks++; + collapsed[breaks] = extent; + exp_stride = strides[d] * extent; + } + } + + // now work back down from the top dimension and increase fragment + // count for each break + size_t frags = field_count; + for(int d = N+1; d >= 0; d--) { + if(d <= breaks) + frags *= collapsed[d]; + fragments[d] += frags; + } + } + + template + void TransferDomainIndexSpace::count_fragments(RegionInstance inst, + const std::vector& dim_order, + const std::vector& fields, + const std::vector& fld_sizes, + std::vector& fragments) const + { + RegionInstanceImpl *inst_impl = get_runtime()->get_instance_impl(inst); +#ifdef DEBUG_REALM + assert(inst_impl->metadata.is_valid()); +#endif + const InstanceLayout *inst_layout = checked_cast *>(inst_impl->metadata.layout); + + fragments.assign(N+2, 0); + + for(size_t i = 0; i < fields.size(); i++) { + FieldID fid = fields[i]; + size_t field_size = fld_sizes[i]; + + const InstancePieceList *ipl; + { + std::map::const_iterator it = inst_layout->fields.find(fid); + assert(it != inst_layout->fields.end()); + ipl = &inst_layout->piece_lists[it->second.list_idx]; + } + + IndexSpaceIterator isi(is); + + // get the piece for the first index + const InstanceLayoutPiece *layout_piece = ipl->find_piece(isi.rect.lo); + assert(layout_piece != 0); + + if(layout_piece->bounds.contains(is)) { + // easy case: one piece covers our entire domain and the iteration order + // doesn't impact the fragment count + if(layout_piece->layout_type == PieceLayoutTypes::AffineLayoutType) { + const AffineLayoutPiece *affine = static_cast *>(layout_piece); + do { + add_fragments_for_rect(isi.rect, field_size, 1 /*field count*/, + affine->strides, dim_order, fragments); + isi.step(); + } while(isi.valid); + } else { + // not affine - add one fragment for each rectangle + size_t num_rects; + if(is.dense()) { + num_rects = 1; + } else { + SparsityMapPublicImpl 
*s_impl = is.sparsity.impl(); + num_rects = s_impl->get_entries().size(); + } + + for(int i = 0; i < (N + 2); i++) + fragments[i] += num_rects; + } + } else { + size_t non_affine_rects = 0; + do { + Point next_start = isi.rect.lo; + while(true) { + // look up new piece if needed + if(!layout_piece->bounds.contains(next_start)) { + layout_piece = ipl->find_piece(next_start); + assert(layout_piece != 0); + } + + Rect subrect; + bool last = next_subrect(isi.rect, next_start, layout_piece->bounds, + dim_order.data(), subrect, next_start); + if(layout_piece->layout_type == PieceLayoutTypes::AffineLayoutType) { + const AffineLayoutPiece *affine = static_cast *>(layout_piece); + add_fragments_for_rect(isi.rect, field_size, 1 /*field count*/, + affine->strides, dim_order, fragments); + } else + non_affine_rects++; + + if(last) break; + } + + isi.step(); + } while(isi.valid); + + if(non_affine_rects > 0) + for(int i = 0; i < (N + 2); i++) + fragments[i] += non_affine_rects; + } + } + } + template TransferIterator *TransferDomainIndexSpace::create_iterator(RegionInstance inst, const std::vector& dim_order, @@ -2159,6 +2366,234 @@ namespace Realm { } + //////////////////////////////////////////////////////////////////////// + // + // transfer path search logic + // + + static bool best_channel_for_mem_pair(Memory src_mem, Memory dst_mem, + CustomSerdezID src_serdez_id, + CustomSerdezID dst_serdez_id, + ReductionOpID redop_id, + size_t total_bytes, + const std::vector *src_frags, + const std::vector *dst_frags, + uint64_t& best_cost, + Channel *& best_channel, + XferDesKind& best_kind) + { + // consider dma channels available on either source or dest node + NodeID src_node = ID(src_mem).memory_owner_node(); + NodeID dst_node = ID(dst_mem).memory_owner_node(); + + best_cost = 0; + best_channel = 0; + best_kind = XFER_NONE; + + { + const Node& n = get_runtime()->nodes[src_node]; + for(std::vector::const_iterator it = n.dma_channels.begin(); + it != n.dma_channels.end(); + ++it) 
{ + XferDesKind kind = XFER_NONE; + uint64_t cost = (*it)->supports_path(src_mem, dst_mem, + src_serdez_id, dst_serdez_id, + redop_id, + total_bytes, src_frags, dst_frags, + &kind); + if((cost > 0) && ((best_cost == 0) || (cost < best_cost))) { + best_cost = cost; + best_channel = *it; + best_kind = kind; + } + } + } + + if(dst_node != src_node) { + const Node& n = get_runtime()->nodes[dst_node]; + for(std::vector::const_iterator it = n.dma_channels.begin(); + it != n.dma_channels.end(); + ++it) { + XferDesKind kind = XFER_NONE; + uint64_t cost = (*it)->supports_path(src_mem, dst_mem, + src_serdez_id, dst_serdez_id, + redop_id, + total_bytes, src_frags, dst_frags, + &kind); + if((cost > 0) && ((best_cost == 0) || (cost < best_cost))) { + best_cost = cost; + best_channel = *it; + best_kind = kind; + } + } + } + + return (best_cost != 0); + } + + static bool find_fastest_path(Memory src_mem, Memory dst_mem, + CustomSerdezID serdez_id, + ReductionOpID redop_id, + size_t total_bytes, + const std::vector *src_frags, + const std::vector *dst_frags, + MemPathInfo& info, + bool skip_final_memcpy = false) + { + std::vector empty_vec; + log_xpath.info() << "FFP: " << src_mem << "->" << dst_mem + << " serdez=" << serdez_id + << " redop=" << redop_id + << " bytes=" << total_bytes + << " frags=" << PrettyVector(*(src_frags ? src_frags : &empty_vec)) + << "/" << PrettyVector(*(dst_frags ? dst_frags : &empty_vec)); + + // baseline - is a direct path possible? 
+ uint64_t best_cost = 0; + { + Channel *channel; + XferDesKind kind; + if(best_channel_for_mem_pair(src_mem, dst_mem, serdez_id, serdez_id, + redop_id, total_bytes, src_frags, dst_frags, + best_cost, channel, kind)) { + log_xpath.info() << "direct: " << src_mem << "->" << dst_mem + << " cost=" << best_cost; + info.path.assign(1, src_mem); + if(!skip_final_memcpy || (kind != XFER_MEM_CPY)) { + info.path.push_back(dst_mem); + info.xd_channels.assign(1, channel); + } else + info.xd_channels.clear(); + } + } + + // multi-hop search (have to do this even if a direct path exists) + // any intermediate memory on the src or dst node is a candidate + struct PartialPath { + Memory ib_mem; + uint64_t cost; + std::vector path; + std::vector channels; + }; + std::vector partials; + NodeID src_node = ID(src_mem).memory_owner_node(); + NodeID dst_node = ID(dst_mem).memory_owner_node(); + size_t num_src_ibs, total_ibs; + { + const Node& n = get_runtime()->nodes[src_node]; + num_src_ibs = n.ib_memories.size(); + partials.resize(num_src_ibs); + for(size_t i = 0; i < n.ib_memories.size(); i++) { + partials[i].ib_mem = n.ib_memories[i]->me; + partials[i].cost = 0; + } + } + if(dst_node != src_node) { + const Node& n = get_runtime()->nodes[dst_node]; + total_ibs = num_src_ibs + n.ib_memories.size(); + partials.resize(total_ibs); + for(size_t i = 0; i < n.ib_memories.size(); i++) { + partials[num_src_ibs + i].ib_mem = n.ib_memories[i]->me; + partials[num_src_ibs + i].cost = 0; + } + } else + total_ibs = num_src_ibs; + + // see which of the ib memories we can get to from the original srcmem + std::set active_ibs; + for(size_t i = 0; i < total_ibs; i++) { + uint64_t cost; + Channel *channel; + XferDesKind kind; + if(best_channel_for_mem_pair(src_mem, partials[i].ib_mem, + serdez_id, 0 /*no dst serdez*/, + 0 /*no redop on not-last hops*/, + total_bytes, src_frags, 0 /*no dst_frags*/, + cost, channel, kind)) { + log_xpath.info() << "first: " << src_mem << "->" << partials[i].ib_mem + << " 
cost=" << cost; + // ignore anything that's already worse than the direct path + if((best_cost == 0) || (cost < best_cost)) { + active_ibs.insert(i); + partials[i].cost = cost; + partials[i].path.resize(2); + partials[i].path[0] = src_mem; + partials[i].path[1] = partials[i].ib_mem; + partials[i].channels.assign(1, channel); + } + } + } + + // look for multi-ib-hop paths (as long as they improve on the shorter + // ones) + while(!active_ibs.empty()) { + size_t src_idx = *(active_ibs.begin()); + active_ibs.erase(active_ibs.begin()); + // an ib on the dst node isn't allowed to go back to the source + size_t first_dst_idx = ((src_idx < num_src_ibs) ? 0 : num_src_ibs); + for(size_t dst_idx = first_dst_idx; dst_idx < total_ibs; dst_idx++) { + // no self-loops either + if(dst_idx == src_idx) continue; + + uint64_t cost; + Channel *channel; + XferDesKind kind; + if(best_channel_for_mem_pair(partials[src_idx].ib_mem, + partials[dst_idx].ib_mem, + 0, 0, 0, // no serdez or redop on interhops + total_bytes, 0, 0, // no fragmentation also + cost, channel, kind)) { + size_t total_cost = partials[src_idx].cost + cost; + log_xpath.info() << "inter: " << partials[src_idx].ib_mem << "->" << partials[dst_idx].ib_mem + << " cost=" << partials[src_idx].cost << "+" << cost << " = " << total_cost << " " << dst_mem + << " cost=" << partials[i].cost << "+" << cost << " = " << total_cost << " src_frags, dst_frags; + domain->count_fragments(srcs[i].inst, dim_order, + std::vector(1, srcs[i].field_id), + std::vector(1, srcs[i].size), + src_frags); + domain->count_fragments(dsts[i].inst, dim_order, + std::vector(1, dsts[i].field_id), + std::vector(1, dsts[i].size), + dst_frags); + MemPathInfo path_info; - bool ok = find_shortest_path(src_mem, dst_mem, serdez_id, - dsts[i].redop_id, - path_info); + bool ok = find_fastest_path(src_mem, dst_mem, serdez_id, + dsts[i].redop_id, + domain_size * combined_field_size, + &src_frags, &dst_frags, + path_info); if(!ok) { log_new_dma.fatal() << "FATAL: no 
path found from " << src_mem << " to " << dst_mem << " (redop=" << dsts[i].redop_id << ")"; assert(0); @@ -3418,10 +3865,25 @@ namespace Realm { if(dsts[i].indirect_index == -1) { Memory dst_mem = dsts[i].inst.get_location(); + std::vector src_frags, dst_frags; + domain->count_fragments(srcs[i].inst, dim_order, + std::vector(1, srcs[i].field_id), + std::vector(1, srcs[i].size), + src_frags); + domain->count_fragments(dsts[i].inst, dim_order, + std::vector(1, dsts[i].field_id), + std::vector(1, dsts[i].size), + dst_frags); + //log_new_dma.print() << "fragments: domain=" << *domain + // << " src_inst=" << srcs[i].inst << " frags=" << PrettyVector(src_frags) + // << " dst_inst=" << dsts[i].inst << " frags=" << PrettyVector(dst_frags); + MemPathInfo path_info; - bool ok = find_shortest_path(src_mem, dst_mem, serdez_id, - 0 /*redop_id*/, - path_info); + bool ok = find_fastest_path(src_mem, dst_mem, serdez_id, + 0 /*redop_id*/, + domain_size * combined_field_size, + &src_frags, &dst_frags, + path_info); if(!ok) { log_new_dma.fatal() << "FATAL: no path found from " << src_mem << " to " << dst_mem << " (serdez=" << serdez_id << ")"; assert(0); From 3950b93eb328dad132a4c69046d3cd5227270594 Mon Sep 17 00:00:00 2001 From: Sean Treichler Date: Sat, 7 Aug 2021 08:08:14 -0700 Subject: [PATCH 26/36] realm: add dma ib mem in gpu fb with -ll:ib_fsize --- runtime/realm/cuda/cuda_internal.cc | 53 ++++++++++++++++++---- runtime/realm/cuda/cuda_internal.h | 17 ++++++- runtime/realm/cuda/cuda_module.cc | 70 +++++++++++++++++++++++++++-- runtime/realm/cuda/cuda_module.h | 2 +- 4 files changed, 127 insertions(+), 15 deletions(-) diff --git a/runtime/realm/cuda/cuda_internal.cc b/runtime/realm/cuda/cuda_internal.cc index 9b517c5af5..718a29abf0 100644 --- a/runtime/realm/cuda/cuda_internal.cc +++ b/runtime/realm/cuda/cuda_internal.cc @@ -51,12 +51,15 @@ namespace Realm { src_gpus.resize(inputs_info.size(), 0); for(size_t i = 0; i < input_ports.size(); i++) if(input_ports[i].mem->kind == 
MemoryImpl::MKIND_GPUFB) - src_gpus[i] = checked_cast(input_ports[0].mem)->gpu; - + src_gpus[i] = (ID(input_ports[i].mem->me).is_memory() ? + (checked_cast(input_ports[i].mem))->gpu : + (checked_cast(input_ports[i].mem))->gpu); dst_gpus.resize(outputs_info.size(), 0); for(size_t i = 0; i < output_ports.size(); i++) if(output_ports[i].mem->kind == MemoryImpl::MKIND_GPUFB) - dst_gpus[i] = checked_cast(output_ports[0].mem)->gpu; + dst_gpus[i] = (ID(output_ports[i].mem->me).is_memory() ? + (checked_cast(output_ports[i].mem))->gpu : + (checked_cast(output_ports[i].mem))->gpu); } long GPUXferDes::get_requests(Request** requests, long nr) @@ -470,6 +473,7 @@ namespace Realm { xdq.ordered_mode = false; Memory fbm = src_gpu->fbmem->me; + Memory fbib = (src_gpu->fb_ibmem ? src_gpu->fb_ibmem->me : Memory::NO_MEMORY); switch(_kind) { case XFER_GPU_TO_FB: @@ -479,16 +483,26 @@ namespace Realm { unsigned frag_overhead = 2000; // HACK - estimate at 2 us for(std::set::const_iterator it = src_gpu->pinned_sysmems.begin(); it != src_gpu->pinned_sysmems.end(); - ++it) + ++it) { add_path(*it, fbm, bw, latency, frag_overhead, XFER_GPU_TO_FB) .set_max_dim(2); // D->H cudamemcpy3d is unrolled into 2d copies + if(fbib.exists()) + add_path(*it, fbib, bw, latency, frag_overhead, XFER_GPU_TO_FB) + .set_max_dim(2); // D->H cudamemcpy3d is unrolled into 2d copies + } + for(std::set::const_iterator it = src_gpu->managed_mems.begin(); it != src_gpu->managed_mems.end(); - ++it) + ++it) { add_path(*it, fbm, bw, latency, frag_overhead, XFER_GPU_TO_FB) .set_max_dim(2); // D->H cudamemcpy3d is unrolled into 2d copies + if(fbib.exists()) + add_path(*it, fbib, bw, latency, frag_overhead, XFER_GPU_TO_FB) + .set_max_dim(2); // D->H cudamemcpy3d is unrolled into 2d copies + } + break; } @@ -499,16 +513,26 @@ namespace Realm { unsigned frag_overhead = 2000; // HACK - estimate at 2 us for(std::set::const_iterator it = src_gpu->pinned_sysmems.begin(); it != src_gpu->pinned_sysmems.end(); - ++it) + ++it) { 
add_path(fbm, *it, bw, latency, frag_overhead, XFER_GPU_FROM_FB) .set_max_dim(2); // H->D cudamemcpy3d is unrolled into 2d copies + if(fbib.exists()) + add_path(fbib, *it, bw, latency, frag_overhead, XFER_GPU_FROM_FB) + .set_max_dim(2); // H->D cudamemcpy3d is unrolled into 2d copies + } + for(std::set::const_iterator it = src_gpu->managed_mems.begin(); it != src_gpu->managed_mems.end(); - ++it) + ++it) { add_path(fbm, *it, bw, latency, frag_overhead, XFER_GPU_FROM_FB) .set_max_dim(2); // H->D cudamemcpy3d is unrolled into 2d copies + if(fbib.exists()) + add_path(fbib, *it, bw, latency, frag_overhead, XFER_GPU_FROM_FB) + .set_max_dim(2); // H->D cudamemcpy3d is unrolled into 2d copies + } + break; } @@ -521,6 +545,14 @@ namespace Realm { add_path(fbm, fbm, bw, latency, frag_overhead, XFER_GPU_IN_FB) .set_max_dim(3); + if(fbib.exists()) { + add_path(fbm, fbib, bw, latency, frag_overhead, XFER_GPU_IN_FB) + .set_max_dim(3); + add_path(fbib, fbm, bw, latency, frag_overhead, XFER_GPU_IN_FB) + .set_max_dim(3); + // TODO: do we need to add the self-path for the ibmem? 
+ } + break; } @@ -532,10 +564,15 @@ namespace Realm { unsigned frag_overhead = 2000; // HACK - estimate at 2 us for(std::set::const_iterator it = src_gpu->peer_fbs.begin(); it != src_gpu->peer_fbs.end(); - ++it) + ++it) { add_path(fbm, *it, bw, latency, frag_overhead, XFER_GPU_PEER_FB) .set_max_dim(3); + if(fbib.exists()) + add_path(fbib, *it, bw, latency, frag_overhead, XFER_GPU_PEER_FB) + .set_max_dim(3); + } + break; } diff --git a/runtime/realm/cuda/cuda_internal.h b/runtime/realm/cuda/cuda_internal.h index a6e4fbbfca..a5624a2a0d 100644 --- a/runtime/realm/cuda/cuda_internal.h +++ b/runtime/realm/cuda/cuda_internal.h @@ -36,6 +36,7 @@ #include "realm/mem_impl.h" #include "realm/bgwork.h" #include "realm/transfer/channel.h" +#include "realm/transfer/ib_memory.h" #define CHECK_CUDART(cmd) do { \ cudaError_t ret = (cmd); \ @@ -103,6 +104,7 @@ namespace Realm { class GPUStream; class GPUFBMemory; class GPUZCMemory; + class GPUFBIBMemory; class GPU; class CudaModule; @@ -508,7 +510,7 @@ namespace Realm { #endif void create_processor(RuntimeImpl *runtime, size_t stack_size); - void create_fb_memory(RuntimeImpl *runtime, size_t size); + void create_fb_memory(RuntimeImpl *runtime, size_t size, size_t ib_size); void create_dma_channels(Realm::RuntimeImpl *r); @@ -608,9 +610,10 @@ namespace Realm { GPUWorker *worker; GPUProcessor *proc; GPUFBMemory *fbmem; + GPUFBIBMemory *fb_ibmem; CUcontext context; - CUdeviceptr fbmem_base; + CUdeviceptr fbmem_base, fb_ibmem_base; // which system memories have been registered and can be used for cuMemcpyAsync std::set pinned_sysmems; @@ -771,6 +774,16 @@ namespace Realm { NetworkSegment local_segment; }; + class GPUFBIBMemory : public IBMemory { + public: + GPUFBIBMemory(Memory _me, GPU *_gpu, CUdeviceptr _base, size_t _size); + + public: + GPU *gpu; + CUdeviceptr base; + NetworkSegment local_segment; + }; + class GPURequest; class GPUCompletionEvent : public GPUCompletionNotification { diff --git a/runtime/realm/cuda/cuda_module.cc 
b/runtime/realm/cuda/cuda_module.cc index a868569901..1b5c9696ec 100644 --- a/runtime/realm/cuda/cuda_module.cc +++ b/runtime/realm/cuda/cuda_module.cc @@ -2257,7 +2257,7 @@ namespace Realm { //////////////////////////////////////////////////////////////////////// // - // class GPU + // class GPUFBMemory GPUFBMemory::GPUFBMemory(Memory _me, GPU *_gpu, CUdeviceptr _base, size_t _size) : LocalManagedMemory(_me, _size, MKIND_GPUFB, 512, Memory::GPU_FB_MEM, 0) @@ -2327,6 +2327,25 @@ namespace Realm { return (cpu_base + offset); } + //////////////////////////////////////////////////////////////////////// + // + // class GPUFBIBMemory + + GPUFBIBMemory::GPUFBIBMemory(Memory _me, GPU *_gpu, + CUdeviceptr _base, size_t _size) + : IBMemory(_me, _size, MKIND_GPUFB, Memory::GPU_FB_MEM, + reinterpret_cast(_base), 0) + , gpu(_gpu) + , base(_base) + { + // advertise for potential gpudirect support + local_segment.assign(NetworkSegmentInfo::CudaDeviceMem, + reinterpret_cast(_base), _size, + reinterpret_cast(_gpu)); + segment = &local_segment; + } + + // Helper methods for emulating the cuda runtime /*static*/ GPUProcessor* GPUProcessor::get_current_gpu_proc(void) { @@ -2757,7 +2776,9 @@ namespace Realm { CUcontext _context, int num_streams) : module(_module), info(_info), worker(_worker) - , proc(0), fbmem(0), context(_context), fbmem_base(0), next_stream(0) + , proc(0), fbmem(0), fb_ibmem(0) + , context(_context), fbmem_base(0), fb_ibmem_base(0) + , next_stream(0) { push_context(); @@ -2813,6 +2834,9 @@ namespace Realm { if(fbmem_base) CHECK_CU( CUDA_DRIVER_FNPTR(cuMemFree)(fbmem_base) ); + if(fb_ibmem_base) + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemFree)(fb_ibmem_base) ); + CHECK_CU( CUDA_DRIVER_FNPTR(cuDevicePrimaryCtxRelease)(info->device) ); } @@ -2902,6 +2926,9 @@ namespace Realm { log_gpu.info() << "peer access enabled from GPU " << p << " to FB " << (*it)->fbmem->me; peer_fbs.insert((*it)->fbmem->me); + if((*it)->fb_ibmem) + peer_fbs.insert((*it)->fb_ibmem->me); + { 
Machine::ProcessorMemoryAffinity pma; pma.p = p; @@ -2913,7 +2940,7 @@ namespace Realm { } } - void GPU::create_fb_memory(RuntimeImpl *runtime, size_t size) + void GPU::create_fb_memory(RuntimeImpl *runtime, size_t size, size_t ib_size) { // need the context so we can get an allocation in the right place { @@ -2944,6 +2971,39 @@ namespace Realm { Memory m = runtime->next_local_memory_id(); fbmem = new GPUFBMemory(m, this, fbmem_base, size); runtime->add_memory(fbmem); + + // FB ibmem is a separate allocation for now (consider merging to make + // total number of allocations, network registrations, etc. smaller?) + if(ib_size > 0) { + { + AutoGPUContext agc(this); + + CUresult ret = CUDA_DRIVER_FNPTR(cuMemAlloc)(&fb_ibmem_base, ib_size); + if(ret != CUDA_SUCCESS) { + if(ret == CUDA_ERROR_OUT_OF_MEMORY) { + size_t free_bytes, total_bytes; + CHECK_CU( CUDA_DRIVER_FNPTR(cuMemGetInfo) + (&free_bytes, &total_bytes) ); + log_gpu.fatal() << "insufficient memory on gpu " << info->index + << ": " << ib_size << " bytes needed (from -ll:ib_fsize), " + << free_bytes << " (out of " << total_bytes << ") available"; + } else { + const char *errstring = "error message not available"; +#if CUDA_VERSION >= 6050 + CUDA_DRIVER_FNPTR(cuGetErrorName)(ret, &errstring); +#endif + log_gpu.fatal() << "unexpected error from cuMemAlloc on gpu " << info->index + << ": result=" << ret + << " (" << errstring << ")"; + } + abort(); + } + } + + Memory m = runtime->next_local_ib_memory_id(); + fb_ibmem = new GPUFBIBMemory(m, this, fb_ibmem_base, ib_size); + runtime->add_ib_memory(fb_ibmem); + } } #ifdef REALM_USE_CUDART_HIJACK @@ -3163,6 +3223,7 @@ namespace Realm { , cfg_zc_mem_size(64 << 20) , cfg_zc_ib_size(256 << 20) , cfg_fb_mem_size(256 << 20) + , cfg_fb_ib_size(128 << 20) , cfg_uvm_mem_size(0) , cfg_num_gpus(0) , cfg_gpu_streams(12) @@ -3317,6 +3378,7 @@ namespace Realm { cp.add_option_int_units("-ll:fsize", m->cfg_fb_mem_size, 'm') .add_option_int_units("-ll:zsize", m->cfg_zc_mem_size, 'm') 
+ .add_option_int_units("-ll:ib_fsize", m->cfg_fb_ib_size, 'm') .add_option_int_units("-ll:ib_zsize", m->cfg_zc_ib_size, 'm') .add_option_int_units("-ll:msize", m->cfg_uvm_mem_size, 'm') .add_option_int("-ll:gpu", m->cfg_num_gpus) @@ -3616,7 +3678,7 @@ namespace Realm { for(std::vector::iterator it = gpus.begin(); it != gpus.end(); it++) - (*it)->create_fb_memory(runtime, cfg_fb_mem_size); + (*it)->create_fb_memory(runtime, cfg_fb_mem_size, cfg_fb_ib_size); // a single ZC memory for everybody if((cfg_zc_mem_size > 0) && !gpus.empty()) { diff --git a/runtime/realm/cuda/cuda_module.h b/runtime/realm/cuda/cuda_module.h index 50a87a9c4e..2840107a7e 100644 --- a/runtime/realm/cuda/cuda_module.h +++ b/runtime/realm/cuda/cuda_module.h @@ -70,7 +70,7 @@ namespace Realm { public: size_t cfg_zc_mem_size, cfg_zc_ib_size; - size_t cfg_fb_mem_size; + size_t cfg_fb_mem_size, cfg_fb_ib_size; size_t cfg_uvm_mem_size; unsigned cfg_num_gpus, cfg_gpu_streams; bool cfg_use_worker_threads, cfg_use_shared_worker, cfg_pin_sysmem; From 74136ee4d657342dec8c5e9bc1365c5978c7c703 Mon Sep 17 00:00:00 2001 From: Sean Treichler Date: Sun, 22 Aug 2021 06:53:02 -0700 Subject: [PATCH 27/36] realm: extra debugging for cuda memcpys --- runtime/realm/cuda/cuda_internal.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/runtime/realm/cuda/cuda_internal.cc b/runtime/realm/cuda/cuda_internal.cc index 718a29abf0..9b08b62fa0 100644 --- a/runtime/realm/cuda/cuda_internal.cc +++ b/runtime/realm/cuda/cuda_internal.cc @@ -641,6 +641,10 @@ namespace Realm { void GPUTransferCompletion::request_completed(void) { + log_gpudma.info() << "gpu memcpy complete: xd=" << std::hex << xd->guid << std::dec + << " read=" << read_port_idx << "/" << read_offset + << " write=" << write_port_idx << "/" << write_offset + << " bytes=" << write_size; if(read_port_idx >= 0) xd->update_bytes_read(read_port_idx, read_offset, read_size); if(write_port_idx >= 0) From fb85ee69a107d776853f51faea296cd4df37e4e5 Mon Sep 17 00:00:00 
2001 From: Sean Treichler Date: Wed, 22 Sep 2021 18:38:01 -0700 Subject: [PATCH 28/36] realm: split cuda device-to-device copies across prioritized streams --- runtime/realm/cuda/cuda_internal.cc | 6 +-- runtime/realm/cuda/cuda_internal.h | 15 +++++-- runtime/realm/cuda/cuda_module.cc | 66 ++++++++++++++++++++--------- runtime/realm/cuda/cuda_module.h | 3 +- 4 files changed, 62 insertions(+), 28 deletions(-) diff --git a/runtime/realm/cuda/cuda_internal.cc b/runtime/realm/cuda/cuda_internal.cc index 9b08b62fa0..414014a86e 100644 --- a/runtime/realm/cuda/cuda_internal.cc +++ b/runtime/realm/cuda/cuda_internal.cc @@ -111,7 +111,7 @@ namespace Realm { GPUStream *stream; if(in_gpu) { if(out_gpu == in_gpu) - stream = in_gpu->device_to_device_stream; + stream = in_gpu->get_next_d2d_stream(); else if(!out_gpu) stream = in_gpu->device_to_host_stream; else { @@ -732,7 +732,7 @@ namespace Realm { uintptr_t out_base = reinterpret_cast(out_port->mem->get_direct_ptr(0, 0)); AutoGPUContext agc(channel->gpu); - GPUStream *stream = channel->gpu->device_to_device_stream; + GPUStream *stream = channel->gpu->get_next_d2d_stream(); while(total_bytes < max_bytes) { AddressListCursor& out_alc = out_port->addrcursor; @@ -1141,7 +1141,7 @@ namespace Realm { #endif #endif - stream = gpu->device_to_device_stream; + stream = gpu->get_next_d2d_stream(); } long GPUreduceXferDes::get_requests(Request** requests, long nr) diff --git a/runtime/realm/cuda/cuda_internal.h b/runtime/realm/cuda/cuda_internal.h index a5624a2a0d..ee26784ead 100644 --- a/runtime/realm/cuda/cuda_internal.h +++ b/runtime/realm/cuda/cuda_internal.h @@ -332,7 +332,7 @@ namespace Realm { // with when async work needs doing class GPUStream { public: - GPUStream(GPU *_gpu, GPUWorker *_worker); + GPUStream(GPU *_gpu, GPUWorker *_worker, int rel_priority = 0); ~GPUStream(void); GPU *get_gpu(void) const; @@ -493,8 +493,7 @@ namespace Realm { class GPU { public: GPU(CudaModule *_module, GPUInfo *_info, GPUWorker *worker, - 
CUcontext _context, - int num_streams); + CUcontext _context); ~GPU(void); void push_context(void); @@ -601,6 +600,7 @@ namespace Realm { GPUStream *find_stream(CUstream stream) const; GPUStream *get_null_task_stream(void) const; GPUStream *get_next_task_stream(bool create = false); + GPUStream *get_next_d2d_stream(); protected: CUmodule load_cuda_module(const void *data); @@ -628,12 +628,17 @@ namespace Realm { GPUStream *host_to_device_stream; GPUStream *device_to_host_stream; GPUStream *device_to_device_stream; + std::vector device_to_device_streams; std::vector peer_to_peer_streams; // indexed by target std::vector task_streams; - atomic next_stream; + atomic next_task_stream, next_d2d_stream; GPUEventPool event_pool; + // this can technically be different in each context (but probably isn't + // in practice) + int least_stream_priority, greatest_stream_priority; + #ifdef REALM_USE_CUDART_HIJACK std::map device_modules; std::map device_functions; @@ -1018,6 +1023,7 @@ namespace Realm { #define CUDA_DRIVER_APIS(__op__) \ __op__(cuCtxEnablePeerAccess); \ __op__(cuCtxGetFlags); \ + __op__(cuCtxGetStreamPriorityRange); \ __op__(cuCtxPopCurrent); \ __op__(cuCtxPushCurrent); \ __op__(cuCtxSynchronize); \ @@ -1062,6 +1068,7 @@ namespace Realm { __op__(cuModuleLoadDataEx); \ __op__(cuStreamAddCallback); \ __op__(cuStreamCreate); \ + __op__(cuStreamCreateWithPriority); \ __op__(cuStreamDestroy); \ __op__(cuStreamSynchronize); \ __op__(cuStreamWaitEvent) diff --git a/runtime/realm/cuda/cuda_module.cc b/runtime/realm/cuda/cuda_module.cc index 1b5c9696ec..2d3757ca2d 100644 --- a/runtime/realm/cuda/cuda_module.cc +++ b/runtime/realm/cuda/cuda_module.cc @@ -101,12 +101,23 @@ namespace Realm { // // class GPUStream - GPUStream::GPUStream(GPU *_gpu, GPUWorker *_worker) + GPUStream::GPUStream(GPU *_gpu, GPUWorker *_worker, + int rel_priority /*= 0*/) : gpu(_gpu), worker(_worker), issuing_copies(false) { assert(worker != 0); - CHECK_CU( CUDA_DRIVER_FNPTR(cuStreamCreate)(&stream, 
CU_STREAM_NON_BLOCKING) ); - log_stream.info() << "CUDA stream " << stream << " created for GPU " << gpu; + // the math here is designed to balance the context's priority range + // around a relative priority of 0, favoring an extra negative (higher + // priority) option + int abs_priority = (gpu->greatest_stream_priority + + rel_priority + + ((gpu->least_stream_priority - + gpu->greatest_stream_priority + 1) / 2)); + // CUDA promises to clamp to the actual range, so we don't have to + CHECK_CU( CUDA_DRIVER_FNPTR(cuStreamCreateWithPriority) + (&stream, CU_STREAM_NON_BLOCKING, abs_priority) ); + log_stream.info() << "stream created: gpu=" << gpu + << " stream=" << stream << " priority=" << abs_priority; } GPUStream::~GPUStream(void) @@ -1994,12 +2005,19 @@ namespace Realm { ThreadLocal::created_gpu_streams->insert(ThreadLocal::current_gpu_stream); return ThreadLocal::current_gpu_stream; } - unsigned index = next_stream.fetch_add(1) % task_streams.size(); + unsigned index = next_task_stream.fetch_add(1) % task_streams.size(); GPUStream *result = task_streams[index]; if (create) ThreadLocal::created_gpu_streams->insert(result); return result; - } + } + + GPUStream *GPU::get_next_d2d_stream() + { + unsigned d2d_stream_index = (next_d2d_stream.fetch_add(1) % + module->cfg_d2d_streams); + return device_to_device_streams[d2d_stream_index]; + } void GPUProcessor::shutdown(void) { @@ -2773,20 +2791,26 @@ namespace Realm { // class GPU GPU::GPU(CudaModule *_module, GPUInfo *_info, GPUWorker *_worker, - CUcontext _context, - int num_streams) + CUcontext _context) : module(_module), info(_info), worker(_worker) , proc(0), fbmem(0), fb_ibmem(0) , context(_context), fbmem_base(0), fb_ibmem_base(0) - , next_stream(0) + , next_task_stream(0), next_d2d_stream(0) { push_context(); + CHECK_CU( CUDA_DRIVER_FNPTR(cuCtxGetStreamPriorityRange) + (&least_stream_priority, &greatest_stream_priority) ); + event_pool.init_pool(); host_to_device_stream = new GPUStream(this, worker); 
device_to_host_stream = new GPUStream(this, worker); - device_to_device_stream = new GPUStream(this, worker); + + device_to_device_streams.resize(module->cfg_d2d_streams, 0); + for(unsigned i = 0; i < module->cfg_d2d_streams; i++) + device_to_device_streams[i] = new GPUStream(this, worker, + module->cfg_d2d_stream_priority); // only create p2p streams for devices we can talk to peer_to_peer_streams.resize(module->gpu_info.size(), 0); @@ -2796,9 +2820,9 @@ namespace Realm { if(info->peers.count((*it)->device) != 0) peer_to_peer_streams[(*it)->index] = new GPUStream(this, worker); - task_streams.resize(num_streams); - for(int idx = 0; idx < num_streams; idx++) - task_streams[idx] = new GPUStream(this, worker); + task_streams.resize(module->cfg_task_streams); + for(unsigned i = 0; i < module->cfg_task_streams; i++) + task_streams[i] = new GPUStream(this, worker); pop_context(); @@ -2817,7 +2841,8 @@ namespace Realm { // destroy streams delete host_to_device_stream; delete device_to_host_stream; - delete device_to_device_stream; + + delete_container_contents(device_to_device_streams); for(std::vector::iterator it = peer_to_peer_streams.begin(); it != peer_to_peer_streams.end(); @@ -2825,10 +2850,7 @@ namespace Realm { if(*it) delete *it; - while(!task_streams.empty()) { - delete task_streams.back(); - task_streams.pop_back(); - } + delete_container_contents(task_streams); // free memory if(fbmem_base) @@ -3226,7 +3248,8 @@ namespace Realm { , cfg_fb_ib_size(128 << 20) , cfg_uvm_mem_size(0) , cfg_num_gpus(0) - , cfg_gpu_streams(12) + , cfg_task_streams(12) + , cfg_d2d_streams(4) , cfg_use_worker_threads(false) , cfg_use_shared_worker(true) , cfg_pin_sysmem(true) @@ -3239,6 +3262,7 @@ namespace Realm { , cfg_lmem_resize_to_max(false) , cfg_multithread_dma(false) , cfg_hostreg_limit(1 << 30) + , cfg_d2d_stream_priority(-1) , shared_worker(0), zcmem_cpu_base(0) , zcib_cpu_base(0), zcmem(0) , uvm_base(0), uvmmem(0) @@ -3382,7 +3406,9 @@ namespace Realm { 
.add_option_int_units("-ll:ib_zsize", m->cfg_zc_ib_size, 'm') .add_option_int_units("-ll:msize", m->cfg_uvm_mem_size, 'm') .add_option_int("-ll:gpu", m->cfg_num_gpus) - .add_option_int("-ll:streams", m->cfg_gpu_streams) + .add_option_int("-ll:streams", m->cfg_task_streams) + .add_option_int("-ll:d2d_streams", m->cfg_d2d_streams) + .add_option_int("-ll:d2d_priority", m->cfg_d2d_stream_priority) .add_option_int("-ll:gpuworkthread", m->cfg_use_worker_threads) .add_option_int("-ll:gpuworker", m->cfg_use_shared_worker) .add_option_int("-ll:pin", m->cfg_pin_sysmem) @@ -3652,7 +3678,7 @@ namespace Realm { worker->add_to_manager(&(runtime->bgwork)); } - GPU *g = new GPU(this, gpu_info[i], worker, context, cfg_gpu_streams); + GPU *g = new GPU(this, gpu_info[i], worker, context); if(!cfg_use_shared_worker) dedicated_workers[g] = worker; diff --git a/runtime/realm/cuda/cuda_module.h b/runtime/realm/cuda/cuda_module.h index 2840107a7e..74f5399e7c 100644 --- a/runtime/realm/cuda/cuda_module.h +++ b/runtime/realm/cuda/cuda_module.h @@ -72,7 +72,7 @@ namespace Realm { size_t cfg_zc_mem_size, cfg_zc_ib_size; size_t cfg_fb_mem_size, cfg_fb_ib_size; size_t cfg_uvm_mem_size; - unsigned cfg_num_gpus, cfg_gpu_streams; + unsigned cfg_num_gpus, cfg_task_streams, cfg_d2d_streams; bool cfg_use_worker_threads, cfg_use_shared_worker, cfg_pin_sysmem; bool cfg_fences_use_callbacks; bool cfg_suppress_hijack_warning; @@ -83,6 +83,7 @@ namespace Realm { bool cfg_lmem_resize_to_max; bool cfg_multithread_dma; size_t cfg_hostreg_limit; + int cfg_d2d_stream_priority; // "global" variables live here too GPUWorker *shared_worker; From 9259bed236e6a4efb58686ea789580f54bcb11ea Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Fri, 6 Aug 2021 13:22:12 -0600 Subject: [PATCH 29/36] hip: ask channels to estimate cost of dma hops --- runtime/realm/hip/hip_internal.cc | 59 ++++++++++++++----------------- 1 file changed, 26 insertions(+), 33 deletions(-) diff --git a/runtime/realm/hip/hip_internal.cc 
b/runtime/realm/hip/hip_internal.cc index cf8740e171..514b9b0944 100644 --- a/runtime/realm/hip/hip_internal.cc +++ b/runtime/realm/hip/hip_internal.cc @@ -443,7 +443,7 @@ namespace Realm { stringbuilder() << "hip channel (gpu=" << _src_gpu->info->index << " kind=" << (int)_kind << ")") { src_gpu = _src_gpu; - + // switch out of ordered mode if multi-threaded dma is requested if(_src_gpu->module->cfg_multithread_dma) xdq.ordered_mode = false; @@ -453,38 +453,28 @@ namespace Realm { switch(_kind) { case XFER_GPU_TO_FB: { - unsigned bw = 0; // TODO - unsigned latency = 0; + unsigned bw = 10000; // HACK - estimate at 10 GB/s + unsigned latency = 1000; // HACK - estimate at 1 us + unsigned frag_overhead = 2000; // HACK - estimate at 2 us for(std::set::const_iterator it = src_gpu->pinned_sysmems.begin(); it != src_gpu->pinned_sysmems.end(); ++it) - add_path(*it, fbm, bw, latency, false, false, - XFER_GPU_TO_FB); - - // for(std::set::const_iterator it = src_gpu->managed_mems.begin(); - // it != src_gpu->managed_mems.end(); - // ++it) - // add_path(*it, fbm, bw, latency, false, false, - // XFER_GPU_TO_FB); + add_path(*it, fbm, bw, latency, frag_overhead, XFER_GPU_TO_FB) + .set_max_dim(2); // D->H cudamemcpy3d is unrolled into 2d copies break; } case XFER_GPU_FROM_FB: { - unsigned bw = 0; // TODO - unsigned latency = 0; + unsigned bw = 10000; // HACK - estimate at 10 GB/s + unsigned latency = 1000; // HACK - estimate at 1 us + unsigned frag_overhead = 2000; // HACK - estimate at 2 us for(std::set::const_iterator it = src_gpu->pinned_sysmems.begin(); it != src_gpu->pinned_sysmems.end(); ++it) - add_path(fbm, *it, bw, latency, false, false, - XFER_GPU_FROM_FB); - - // for(std::set::const_iterator it = src_gpu->managed_mems.begin(); - // it != src_gpu->managed_mems.end(); - // ++it) - // add_path(fbm, *it, bw, latency, false, false, - // XFER_GPU_FROM_FB); + add_path(fbm, *it, bw, latency, frag_overhead, XFER_GPU_FROM_FB) + .set_max_dim(2); // H->D cudamemcpy3d is unrolled 
into 2d copies break; } @@ -492,10 +482,11 @@ namespace Realm { case XFER_GPU_IN_FB: { // self-path - unsigned bw = 0; // TODO - unsigned latency = 0; - add_path(fbm, fbm, bw, latency, false, false, - XFER_GPU_IN_FB); + unsigned bw = 200000; // HACK - estimate at 200 GB/s + unsigned latency = 250; // HACK - estimate at 250 ns + unsigned frag_overhead = 2000; // HACK - estimate at 2 us + add_path(fbm, fbm, bw, latency, frag_overhead, XFER_GPU_IN_FB) + .set_max_dim(3); break; } @@ -503,13 +494,14 @@ namespace Realm { case XFER_GPU_PEER_FB: { // just do paths to peers - they'll do the other side - unsigned bw = 0; // TODO - unsigned latency = 0; + unsigned bw = 50000; // HACK - estimate at 50 GB/s + unsigned latency = 1000; // HACK - estimate at 1 us + unsigned frag_overhead = 2000; // HACK - estimate at 2 us for(std::set::const_iterator it = src_gpu->peer_fbs.begin(); it != src_gpu->peer_fbs.end(); ++it) - add_path(fbm, *it, bw, latency, false, false, - XFER_GPU_PEER_FB); + add_path(fbm, *it, bw, latency, frag_overhead, XFER_GPU_PEER_FB) + .set_max_dim(3); break; } @@ -946,11 +938,12 @@ namespace Realm { { Memory fbm = gpu->fbmem->me; - unsigned bw = 0; // TODO - unsigned latency = 0; + unsigned bw = 300000; // HACK - estimate at 300 GB/s + unsigned latency = 250; // HACK - estimate at 250 ns + unsigned frag_overhead = 2000; // HACK - estimate at 2 us - add_path(Memory::NO_MEMORY, fbm, - bw, latency, false, false, XFER_GPU_IN_FB); + add_path(Memory::NO_MEMORY, fbm, bw, latency, frag_overhead, XFER_GPU_IN_FB) + .set_max_dim(2); xdq.add_to_manager(bgwork); } From a67f0016f44804657f2d6e067a13fba11e2a1bd1 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Sat, 7 Aug 2021 20:37:28 -0600 Subject: [PATCH 30/36] hip: add dma ib mem in gpu fb with -ll:ib_fsize --- runtime/realm/hip/hip_internal.cc | 41 ++++++++++++--- runtime/realm/hip/hip_internal.h | 17 ++++++- runtime/realm/hip/hip_module.cc | 83 ++++++++++++++++++++++++++----- runtime/realm/hip/hip_module.h | 2 +- 4 files 
changed, 120 insertions(+), 23 deletions(-) diff --git a/runtime/realm/hip/hip_internal.cc b/runtime/realm/hip/hip_internal.cc index 514b9b0944..787df6b5a4 100644 --- a/runtime/realm/hip/hip_internal.cc +++ b/runtime/realm/hip/hip_internal.cc @@ -45,12 +45,16 @@ namespace Realm { src_gpus.resize(inputs_info.size(), 0); for(size_t i = 0; i < input_ports.size(); i++) if(input_ports[i].mem->kind == MemoryImpl::MKIND_GPUFB) - src_gpus[i] = checked_cast(input_ports[0].mem)->gpu; + src_gpus[i] = (ID(input_ports[i].mem->me).is_memory() ? + (checked_cast(input_ports[i].mem))->gpu : + (checked_cast(input_ports[i].mem))->gpu); dst_gpus.resize(outputs_info.size(), 0); for(size_t i = 0; i < output_ports.size(); i++) if(output_ports[i].mem->kind == MemoryImpl::MKIND_GPUFB) - dst_gpus[i] = checked_cast(output_ports[0].mem)->gpu; + dst_gpus[i] = (ID(output_ports[i].mem->me).is_memory() ? + (checked_cast(output_ports[i].mem))->gpu : + (checked_cast(output_ports[i].mem))->gpu); } long GPUXferDes::get_requests(Request** requests, long nr) @@ -92,7 +96,7 @@ namespace Realm { if(in_port != 0) { if(out_port != 0) { // input and output both exist - transfer what we can - log_xd.info() << "cuda memcpy chunk: min=" << min_xfer_size + log_xd.info() << "hip memcpy chunk: min=" << min_xfer_size << " max=" << max_bytes; uintptr_t in_base = reinterpret_cast(in_port->mem->get_direct_ptr(0, 0)); @@ -449,6 +453,7 @@ namespace Realm { xdq.ordered_mode = false; Memory fbm = src_gpu->fbmem->me; + Memory fbib = (src_gpu->fb_ibmem ? 
src_gpu->fb_ibmem->me : Memory::NO_MEMORY); switch(_kind) { case XFER_GPU_TO_FB: @@ -458,10 +463,15 @@ namespace Realm { unsigned frag_overhead = 2000; // HACK - estimate at 2 us for(std::set::const_iterator it = src_gpu->pinned_sysmems.begin(); it != src_gpu->pinned_sysmems.end(); - ++it) + ++it) { add_path(*it, fbm, bw, latency, frag_overhead, XFER_GPU_TO_FB) .set_max_dim(2); // D->H cudamemcpy3d is unrolled into 2d copies - + + if(fbib.exists()) + add_path(*it, fbib, bw, latency, frag_overhead, XFER_GPU_TO_FB) + .set_max_dim(2); // D->H cudamemcpy3d is unrolled into 2d copies + } + break; } @@ -472,9 +482,13 @@ namespace Realm { unsigned frag_overhead = 2000; // HACK - estimate at 2 us for(std::set::const_iterator it = src_gpu->pinned_sysmems.begin(); it != src_gpu->pinned_sysmems.end(); - ++it) + ++it) { add_path(fbm, *it, bw, latency, frag_overhead, XFER_GPU_FROM_FB) .set_max_dim(2); // H->D cudamemcpy3d is unrolled into 2d copies + if(fbib.exists()) + add_path(fbib, *it, bw, latency, frag_overhead, XFER_GPU_FROM_FB) + .set_max_dim(2); // H->D cudamemcpy3d is unrolled into 2d copies + } break; } @@ -487,6 +501,13 @@ namespace Realm { unsigned frag_overhead = 2000; // HACK - estimate at 2 us add_path(fbm, fbm, bw, latency, frag_overhead, XFER_GPU_IN_FB) .set_max_dim(3); + if(fbib.exists()) { + add_path(fbm, fbib, bw, latency, frag_overhead, XFER_GPU_IN_FB) + .set_max_dim(3); + add_path(fbib, fbm, bw, latency, frag_overhead, XFER_GPU_IN_FB) + .set_max_dim(3); + // TODO: do we need to add the self-path for the ibmem? 
+ } break; } @@ -499,9 +520,13 @@ namespace Realm { unsigned frag_overhead = 2000; // HACK - estimate at 2 us for(std::set::const_iterator it = src_gpu->peer_fbs.begin(); it != src_gpu->peer_fbs.end(); - ++it) + ++it) { add_path(fbm, *it, bw, latency, frag_overhead, XFER_GPU_PEER_FB) - .set_max_dim(3); + .set_max_dim(3); + if(fbib.exists()) + add_path(fbib, *it, bw, latency, frag_overhead, XFER_GPU_PEER_FB) + .set_max_dim(3); + } break; } diff --git a/runtime/realm/hip/hip_internal.h b/runtime/realm/hip/hip_internal.h index 9f22cccd1b..f74714acd0 100644 --- a/runtime/realm/hip/hip_internal.h +++ b/runtime/realm/hip/hip_internal.h @@ -34,6 +34,7 @@ typedef char* hipDeviceCharptr_t; #include "realm/mem_impl.h" #include "realm/bgwork.h" #include "realm/transfer/channel.h" +#include "realm/transfer/ib_memory.h" #define CHECK_CUDART(cmd) do { \ hipError_t ret = (cmd); \ @@ -94,6 +95,7 @@ namespace Realm { class GPUStream; class GPUFBMemory; class GPUZCMemory; + class GPUFBIBMemory; class GPU; class HipModule; @@ -500,7 +502,7 @@ namespace Realm { #endif void create_processor(RuntimeImpl *runtime, size_t stack_size); - void create_fb_memory(RuntimeImpl *runtime, size_t size); + void create_fb_memory(RuntimeImpl *runtime, size_t size, size_t ib_size); void create_dma_channels(Realm::RuntimeImpl *r); @@ -600,10 +602,11 @@ namespace Realm { GPUWorker *worker; GPUProcessor *proc; GPUFBMemory *fbmem; + GPUFBIBMemory *fb_ibmem; //hipCtx_t context; int device_id; - hipDeviceCharptr_t fbmem_base; + hipDeviceCharptr_t fbmem_base, fb_ibmem_base; // which system memories have been registered and can be used for cuMemcpyAsync std::set pinned_sysmems; @@ -754,6 +757,16 @@ namespace Realm { NetworkSegment local_segment; }; + class GPUFBIBMemory : public IBMemory { + public: + GPUFBIBMemory(Memory _me, GPU *_gpu, hipDeviceCharptr_t _base, size_t _size); + + public: + GPU *gpu; + hipDeviceCharptr_t base; + NetworkSegment local_segment; + }; + class GPURequest; class GPUCompletionEvent : 
public GPUCompletionNotification { diff --git a/runtime/realm/hip/hip_module.cc b/runtime/realm/hip/hip_module.cc index cb92acb991..59151f1d30 100644 --- a/runtime/realm/hip/hip_module.cc +++ b/runtime/realm/hip/hip_module.cc @@ -2255,7 +2255,7 @@ namespace Realm { //////////////////////////////////////////////////////////////////////// // - // class GPU + // class GPUFBMemory GPUFBMemory::GPUFBMemory(Memory _me, GPU *_gpu, hipDeviceCharptr_t _base, size_t _size) : LocalManagedMemory(_me, _size, MKIND_GPUFB, 512, Memory::GPU_FB_MEM, 0) @@ -2323,6 +2323,25 @@ namespace Realm { { return (cpu_base + offset); } + + //////////////////////////////////////////////////////////////////////// + // + // class GPUFBIBMemory + + GPUFBIBMemory::GPUFBIBMemory(Memory _me, GPU *_gpu, + hipDeviceCharptr_t _base, size_t _size) + : IBMemory(_me, _size, MKIND_GPUFB, Memory::GPU_FB_MEM, + reinterpret_cast(_base), 0) + , gpu(_gpu) + , base(_base) + { + // advertise for potential gpudirect support + local_segment.assign(NetworkSegmentInfo::HipDeviceMem, + reinterpret_cast(_base), _size, + reinterpret_cast(_gpu)); + segment = &local_segment; + } + // Helper methods for emulating the cuda runtime /*static*/ GPUProcessor* GPUProcessor::get_current_gpu_proc(void) @@ -2668,7 +2687,9 @@ namespace Realm { int _device_id, int num_streams) : module(_module), info(_info), worker(_worker) - , proc(0), fbmem(0), device_id(_device_id), next_stream(0) + , proc(0), fbmem(0), fb_ibmem(0) + , device_id(_device_id), fbmem_base(0), fb_ibmem_base(0) + , next_stream(0) { push_context(); @@ -2721,7 +2742,11 @@ namespace Realm { } // free memory - CHECK_CU( hipFree((void *)fbmem_base) ); + if(fbmem_base) + CHECK_CU( hipFree((void *)fbmem_base) ); + + if(fb_ibmem_base) + CHECK_CU( hipFree((void *)fb_ibmem_base) ); //CHECK_CU( hipDevicePrimaryCtxRelease(info->device) ); } @@ -2790,6 +2815,9 @@ namespace Realm { } log_gpu.info() << "peer access enabled from GPU " << p << " to FB " << (*it)->fbmem->me; 
peer_fbs.insert((*it)->fbmem->me); + + if((*it)->fb_ibmem) + peer_fbs.insert((*it)->fb_ibmem->me); { Machine::ProcessorMemoryAffinity pma; @@ -2802,7 +2830,7 @@ namespace Realm { } } - void GPU::create_fb_memory(RuntimeImpl *runtime, size_t size) + void GPU::create_fb_memory(RuntimeImpl *runtime, size_t size, size_t ib_size) { // need the context so we can get an allocation in the right place { @@ -2816,16 +2844,14 @@ namespace Realm { size_t free_bytes, total_bytes; CHECK_CU( hipMemGetInfo(&free_bytes, &total_bytes) ); log_gpu.fatal() << "insufficient memory on gpu " << info->index - << ": " << size << " bytes needed (from -ll:fsize), " - << free_bytes << " (out of " << total_bytes << ") available"; + << ": " << size << " bytes needed (from -ll:fsize), " + << free_bytes << " (out of " << total_bytes << ") available"; } else { const char *errstring = "error message not available"; -#if HIP_VERBOSE_ERROR_MSG == 1 errstring = hipGetErrorName(ret); -#endif - log_gpu.fatal() << "unexpected error from cuMemAlloc on gpu " << info->index - << ": result=" << ret - << " (" << errstring << ")"; + log_gpu.fatal() << "unexpected error from hipMalloc on gpu " << info->index + << ": result=" << ret + << " (" << errstring << ")"; } abort(); } @@ -2834,6 +2860,37 @@ namespace Realm { Memory m = runtime->next_local_memory_id(); fbmem = new GPUFBMemory(m, this, (hipDeviceCharptr_t)fbmem_base, size); runtime->add_memory(fbmem); + + // FB ibmem is a separate allocation for now (consider merging to make + // total number of allocations, network registrations, etc. smaller?) 
+ if(ib_size > 0) { + { + AutoGPUContext agc(this); + + hipError_t ret = hipMalloc((void **)&fb_ibmem_base, ib_size); + printf("ib hipmalloc %p, size %ld\n", (void *)fb_ibmem_base, ib_size); + if(ret != hipSuccess) { + if(ret == hipErrorMemoryAllocation) { + size_t free_bytes, total_bytes; + CHECK_CU( hipMemGetInfo(&free_bytes, &total_bytes) ); + log_gpu.fatal() << "insufficient memory on gpu " << info->index + << ": " << ib_size << " bytes needed (from -ll:ib_fsize), " + << free_bytes << " (out of " << total_bytes << ") available"; + } else { + const char *errstring = "error message not available"; + errstring = hipGetErrorName(ret); + log_gpu.fatal() << "unexpected error from hipMalloc on gpu " << info->index + << ": result=" << ret + << " (" << errstring << ")"; + } + abort(); + } + } + + Memory m = runtime->next_local_ib_memory_id(); + fb_ibmem = new GPUFBIBMemory(m, this, fb_ibmem_base, ib_size); + runtime->add_ib_memory(fb_ibmem); + } } #ifdef REALM_USE_HIP_HIJACK @@ -3007,6 +3064,7 @@ namespace Realm { , cfg_zc_mem_size(64 << 20) , cfg_zc_ib_size(256 << 20) , cfg_fb_mem_size(256 << 20) + , cfg_fb_ib_size(128 << 20) , cfg_num_gpus(0) , cfg_gpu_streams(12) , cfg_use_worker_threads(false) @@ -3048,6 +3106,7 @@ namespace Realm { cp.add_option_int_units("-ll:fsize", m->cfg_fb_mem_size, 'm') .add_option_int_units("-ll:zsize", m->cfg_zc_mem_size, 'm') + .add_option_int_units("-ll:ib_fsize", m->cfg_fb_ib_size, 'm') .add_option_int_units("-ll:ib_zsize", m->cfg_zc_ib_size, 'm') .add_option_int("-ll:gpu", m->cfg_num_gpus) .add_option_int("-ll:streams", m->cfg_gpu_streams) @@ -3321,7 +3380,7 @@ namespace Realm { for(std::vector::iterator it = gpus.begin(); it != gpus.end(); it++) - (*it)->create_fb_memory(runtime, cfg_fb_mem_size); + (*it)->create_fb_memory(runtime, cfg_fb_mem_size, cfg_fb_ib_size); // a single ZC memory for everybody if((cfg_zc_mem_size > 0) && !gpus.empty()) { diff --git a/runtime/realm/hip/hip_module.h b/runtime/realm/hip/hip_module.h index 
556122b730..599315a4e1 100644 --- a/runtime/realm/hip/hip_module.h +++ b/runtime/realm/hip/hip_module.h @@ -71,7 +71,7 @@ namespace Realm { public: size_t cfg_zc_mem_size, cfg_zc_ib_size; - size_t cfg_fb_mem_size; + size_t cfg_fb_mem_size, cfg_fb_ib_size; unsigned cfg_num_gpus, cfg_gpu_streams; bool cfg_use_worker_threads, cfg_use_shared_worker, cfg_pin_sysmem; bool cfg_fences_use_callbacks; From dc822a18067f0fcb44b01062baa1ff5cf178e6dc Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Thu, 23 Sep 2021 03:10:26 -0500 Subject: [PATCH 31/36] hip: extra debugging for cuda memcpys --- runtime/realm/hip/hip_internal.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/runtime/realm/hip/hip_internal.cc b/runtime/realm/hip/hip_internal.cc index 787df6b5a4..9d997000c2 100644 --- a/runtime/realm/hip/hip_internal.cc +++ b/runtime/realm/hip/hip_internal.cc @@ -596,6 +596,10 @@ namespace Realm { void GPUTransferCompletion::request_completed(void) { + log_gpudma.info() << "gpu memcpy complete: xd=" << std::hex << xd->guid << std::dec + << " read=" << read_port_idx << "/" << read_offset + << " write=" << write_port_idx << "/" << write_offset + << " bytes=" << write_size; if(read_port_idx >= 0) xd->update_bytes_read(read_port_idx, read_offset, read_size); if(write_port_idx >= 0) From 27ad25c4da51b5f7389609b1b1273dac7072c517 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Thu, 23 Sep 2021 02:12:07 -0500 Subject: [PATCH 32/36] hip: split hip device-to-device copies across prioritized streams --- runtime/realm/hip/hip_internal.cc | 4 +- runtime/realm/hip/hip_internal.h | 13 ++++-- runtime/realm/hip/hip_module.cc | 69 +++++++++++++++++++++---------- runtime/realm/hip/hip_module.h | 3 +- 4 files changed, 61 insertions(+), 28 deletions(-) diff --git a/runtime/realm/hip/hip_internal.cc b/runtime/realm/hip/hip_internal.cc index 9d997000c2..fdc7fbe424 100644 --- a/runtime/realm/hip/hip_internal.cc +++ b/runtime/realm/hip/hip_internal.cc @@ -106,7 +106,7 @@ namespace Realm { GPUStream *stream; 
if(in_gpu) { if(out_gpu == in_gpu) - stream = in_gpu->device_to_device_stream; + stream = in_gpu->get_next_d2d_stream(); else if(!out_gpu) stream = in_gpu->device_to_host_stream; else { @@ -687,7 +687,7 @@ namespace Realm { uintptr_t out_base = reinterpret_cast(out_port->mem->get_direct_ptr(0, 0)); AutoGPUContext agc(channel->gpu); - GPUStream *stream = channel->gpu->device_to_device_stream; + GPUStream *stream = channel->gpu->get_next_d2d_stream(); while(total_bytes < max_bytes) { AddressListCursor& out_alc = out_port->addrcursor; diff --git a/runtime/realm/hip/hip_internal.h b/runtime/realm/hip/hip_internal.h index f74714acd0..bc41c33ad1 100644 --- a/runtime/realm/hip/hip_internal.h +++ b/runtime/realm/hip/hip_internal.h @@ -323,7 +323,7 @@ namespace Realm { // with when async work needs doing class GPUStream { public: - GPUStream(GPU *_gpu, GPUWorker *_worker); + GPUStream(GPU *_gpu, GPUWorker *_worker, int rel_priority = 0); ~GPUStream(void); GPU *get_gpu(void) const; @@ -485,8 +485,7 @@ namespace Realm { class GPU { public: GPU(HipModule *_module, GPUInfo *_info, GPUWorker *worker, - int _device_id, - int num_streams); + int _device_id); ~GPU(void); void push_context(void); @@ -593,6 +592,7 @@ namespace Realm { GPUStream *find_stream(hipStream_t stream) const; GPUStream *get_null_task_stream(void) const; GPUStream *get_next_task_stream(bool create = false); + GPUStream *get_next_d2d_stream(); protected: hipModule_t load_hip_module(const void *data); @@ -618,12 +618,17 @@ namespace Realm { GPUStream *host_to_device_stream; GPUStream *device_to_host_stream; GPUStream *device_to_device_stream; + std::vector device_to_device_streams; std::vector peer_to_peer_streams; // indexed by target std::vector task_streams; - atomic next_stream; + atomic next_task_stream, next_d2d_stream; GPUEventPool event_pool; + // this can technically be different in each context (but probably isn't + // in practice) + int least_stream_priority, greatest_stream_priority; + #ifdef 
REALM_USE_HIP_HIJACK std::map device_modules; std::map device_functions; diff --git a/runtime/realm/hip/hip_module.cc b/runtime/realm/hip/hip_module.cc index 59151f1d30..508ac38e41 100644 --- a/runtime/realm/hip/hip_module.cc +++ b/runtime/realm/hip/hip_module.cc @@ -101,12 +101,24 @@ namespace Realm { // // class GPUStream - GPUStream::GPUStream(GPU *_gpu, GPUWorker *_worker) + GPUStream::GPUStream(GPU *_gpu, GPUWorker *_worker, + int rel_priority /*= 0*/) : gpu(_gpu), worker(_worker), issuing_copies(false) { assert(worker != 0); - CHECK_CU( hipStreamCreateWithFlags(&stream, hipStreamNonBlocking) ); - log_stream.info() << "HIP stream " << stream << " created for GPU " << gpu; + + // the math here is designed to balance the context's priority range + // around a relative priority of 0, favoring an extra negative (higher + // priority) option + int abs_priority = (gpu->greatest_stream_priority + + rel_priority + + ((gpu->least_stream_priority - + gpu->greatest_stream_priority + 1) / 2)); + // CUDA promises to clamp to the actual range, so we don't have to + CHECK_CU( hipStreamCreateWithPriority(&stream, hipStreamNonBlocking, + abs_priority) ); + log_stream.info() << "stream created: gpu=" << gpu + << " stream=" << stream << " priority=" << abs_priority; } GPUStream::~GPUStream(void) @@ -1991,13 +2003,20 @@ namespace Realm { ThreadLocal::created_gpu_streams->insert(ThreadLocal::current_gpu_stream); return ThreadLocal::current_gpu_stream; } - unsigned index = next_stream.fetch_add(1) % task_streams.size(); + unsigned index = next_task_stream.fetch_add(1) % task_streams.size(); GPUStream *result = task_streams[index]; if (create) ThreadLocal::created_gpu_streams->insert(result); return result; } + GPUStream *GPU::get_next_d2d_stream() + { + unsigned d2d_stream_index = (next_d2d_stream.fetch_add(1) % + module->cfg_d2d_streams); + return device_to_device_streams[d2d_stream_index]; + } + void GPUProcessor::shutdown(void) { log_gpu.info("shutting down"); @@ -2684,20 
+2703,26 @@ namespace Realm { // class GPU GPU::GPU(HipModule *_module, GPUInfo *_info, GPUWorker *_worker, - int _device_id, - int num_streams) + int _device_id) : module(_module), info(_info), worker(_worker) , proc(0), fbmem(0), fb_ibmem(0) , device_id(_device_id), fbmem_base(0), fb_ibmem_base(0) - , next_stream(0) + , next_task_stream(0), next_d2d_stream(0) { push_context(); + CHECK_CU( hipDeviceGetStreamPriorityRange(&least_stream_priority, + &greatest_stream_priority) ); + event_pool.init_pool(); host_to_device_stream = new GPUStream(this, worker); device_to_host_stream = new GPUStream(this, worker); - device_to_device_stream = new GPUStream(this, worker); + + device_to_device_streams.resize(module->cfg_d2d_streams, 0); + for(unsigned i = 0; i < module->cfg_d2d_streams; i++) + device_to_device_streams[i] = new GPUStream(this, worker, + module->cfg_d2d_stream_priority); // only create p2p streams for devices we can talk to peer_to_peer_streams.resize(module->gpu_info.size(), 0); @@ -2707,9 +2732,9 @@ namespace Realm { if(info->peers.count((*it)->device) != 0) peer_to_peer_streams[(*it)->index] = new GPUStream(this, worker); - task_streams.resize(num_streams); - for(int idx = 0; idx < num_streams; idx++) - task_streams[idx] = new GPUStream(this, worker); + task_streams.resize(module->cfg_task_streams); + for(unsigned i = 0; i < module->cfg_task_streams; i++) + task_streams[i] = new GPUStream(this, worker); pop_context(); @@ -2728,18 +2753,16 @@ namespace Realm { // destroy streams delete host_to_device_stream; delete device_to_host_stream; - delete device_to_device_stream; - + + delete_container_contents(device_to_device_streams); + for(std::vector::iterator it = peer_to_peer_streams.begin(); it != peer_to_peer_streams.end(); ++it) if(*it) delete *it; - while(!task_streams.empty()) { - delete task_streams.back(); - task_streams.pop_back(); - } + delete_container_contents(task_streams); // free memory if(fbmem_base) @@ -3066,7 +3089,8 @@ namespace Realm { , 
cfg_fb_mem_size(256 << 20) , cfg_fb_ib_size(128 << 20) , cfg_num_gpus(0) - , cfg_gpu_streams(12) + , cfg_task_streams(12) + , cfg_d2d_streams(4) , cfg_use_worker_threads(false) , cfg_use_shared_worker(true) , cfg_pin_sysmem(true) @@ -3078,6 +3102,7 @@ namespace Realm { , cfg_max_ctxsync_threads(4) , cfg_multithread_dma(false) , cfg_hostreg_limit(1 << 30) + , cfg_d2d_stream_priority(-1) , shared_worker(0), zcmem_cpu_base(0) , zcib_cpu_base(0), zcmem(0) {} @@ -3109,8 +3134,10 @@ namespace Realm { .add_option_int_units("-ll:ib_fsize", m->cfg_fb_ib_size, 'm') .add_option_int_units("-ll:ib_zsize", m->cfg_zc_ib_size, 'm') .add_option_int("-ll:gpu", m->cfg_num_gpus) - .add_option_int("-ll:streams", m->cfg_gpu_streams) - .add_option_int("-ll:gpuworkthread", m->cfg_use_worker_threads) + .add_option_int("-ll:streams", m->cfg_task_streams) + .add_option_int("-ll:d2d_streams", m->cfg_d2d_streams) + .add_option_int("-ll:d2d_priority", m->cfg_d2d_stream_priority) + .add_option_int("-ll:gpuworkthread", m->cfg_use_worker_threads) .add_option_int("-ll:gpuworker", m->cfg_use_shared_worker) .add_option_int("-ll:pin", m->cfg_pin_sysmem) .add_option_bool("-cuda:callbacks", m->cfg_fences_use_callbacks) @@ -3354,7 +3381,7 @@ namespace Realm { worker->add_to_manager(&(runtime->bgwork)); } - GPU *g = new GPU(this, gpu_info[i], worker, i, cfg_gpu_streams); + GPU *g = new GPU(this, gpu_info[i], worker, i); if(!cfg_use_shared_worker) dedicated_workers[g] = worker; diff --git a/runtime/realm/hip/hip_module.h b/runtime/realm/hip/hip_module.h index 599315a4e1..68eefa5017 100644 --- a/runtime/realm/hip/hip_module.h +++ b/runtime/realm/hip/hip_module.h @@ -72,7 +72,7 @@ namespace Realm { public: size_t cfg_zc_mem_size, cfg_zc_ib_size; size_t cfg_fb_mem_size, cfg_fb_ib_size; - unsigned cfg_num_gpus, cfg_gpu_streams; + unsigned cfg_num_gpus, cfg_task_streams, cfg_d2d_streams; bool cfg_use_worker_threads, cfg_use_shared_worker, cfg_pin_sysmem; bool cfg_fences_use_callbacks; bool 
cfg_suppress_hijack_warning; @@ -82,6 +82,7 @@ namespace Realm { int cfg_max_ctxsync_threads; bool cfg_multithread_dma; size_t cfg_hostreg_limit; + int cfg_d2d_stream_priority; // "global" variables live here too GPUWorker *shared_worker; From e5146cea1a63799a6b846e8f1c0b8bfa2d92bae8 Mon Sep 17 00:00:00 2001 From: Mike Bauer Date: Thu, 16 Dec 2021 00:12:29 -0800 Subject: [PATCH 33/36] legion: more refactoring of distributed index space reference counting to keep index space nodes live until the application removes its references --- runtime/legion/legion_context.cc | 6 +- runtime/legion/legion_ops.cc | 3 +- runtime/legion/region_tree.cc | 205 ++++++++++++++++++++++--------- runtime/legion/region_tree.h | 32 ++++- runtime/legion/runtime.cc | 10 +- runtime/legion/runtime.h | 3 +- 6 files changed, 185 insertions(+), 74 deletions(-) diff --git a/runtime/legion/legion_context.cc b/runtime/legion/legion_context.cc index 7e4fc5daf2..b1f45f135d 100644 --- a/runtime/legion/legion_context.cc +++ b/runtime/legion/legion_context.cc @@ -993,7 +993,8 @@ namespace Legion { REPORT_LEGION_WARNING(LEGION_WARNING_LEAKED_RESOURCE, "Index space %x was leaked out of task tree rooted by task %s", it->first.id, get_task_name()) - runtime->forest->destroy_index_space(it->first, preconditions); + runtime->forest->destroy_index_space(it->first, + runtime->address_space, preconditions); } created_index_spaces.clear(); } @@ -11115,7 +11116,8 @@ namespace Legion { handle.id, get_task_name(), get_unique_id()); #endif std::set preconditions; - runtime->forest->destroy_index_space(handle, preconditions); + runtime->forest->destroy_index_space(handle, + runtime->address_space, preconditions); if (!preconditions.empty()) { AutoLock l_lock(leaf_lock); diff --git a/runtime/legion/legion_ops.cc b/runtime/legion/legion_ops.cc index fb49efc6dd..5e33ce7e98 100644 --- a/runtime/legion/legion_ops.cc +++ b/runtime/legion/legion_ops.cc @@ -8542,7 +8542,8 @@ namespace Legion { #ifdef DEBUG_LEGION 
assert(deletion_req_indexes.empty()); #endif - runtime->forest->destroy_index_space(index_space, preconditions); + runtime->forest->destroy_index_space(index_space, + runtime->address_space, preconditions); if (!sub_partitions.empty()) { for (std::vector::const_iterator it = diff --git a/runtime/legion/region_tree.cc b/runtime/legion/region_tree.cc index 493fcda22d..d6748f57d4 100644 --- a/runtime/legion/region_tree.cc +++ b/runtime/legion/region_tree.cc @@ -484,20 +484,23 @@ namespace Legion { //-------------------------------------------------------------------------- void RegionTreeForest::destroy_index_space(IndexSpace handle, - std::set &applied) + AddressSpaceID source, std::set &applied) //-------------------------------------------------------------------------- { - const AddressSpaceID owner_space = - IndexSpaceNode::get_owner_space(handle, runtime); - if (owner_space == runtime->address_space) + IndexSpaceNode *node = get_node(handle); + WrapperReferenceMutator mutator(applied); + if (node->is_owner()) { - IndexSpaceNode *node = get_node(handle); - WrapperReferenceMutator mutator(applied); + node->invalidate_root(source, applied); if (node->remove_base_valid_ref(APPLICATION_REF, &mutator)) delete node; } else - runtime->send_index_space_destruction(handle, owner_space, applied); + { + runtime->send_index_space_destruction(handle,node->owner_space,applied); + if (node->remove_base_valid_ref(REMOTE_DID_REF, &mutator)) + delete node; + } } //-------------------------------------------------------------------------- @@ -3511,7 +3514,7 @@ namespace Legion { ApEvent is_ready, IndexSpaceExprID expr_id, std::set *applied, - bool add_remote_reference, + bool add_root_reference, unsigned depth) //-------------------------------------------------------------------------- { @@ -3557,19 +3560,37 @@ namespace Legion { // If we are the root then the valid ref comes from the application // Otherwise the valid ref comes from parent partition if (!result->is_owner()) - { - 
// We only add this if requested - if (add_remote_reference) - result->add_base_gc_ref(REMOTE_DID_REF, &mutator); - } - else if (parent == NULL) - result->add_base_valid_ref(APPLICATION_REF, &mutator); - else - result->add_nested_valid_ref(parent->did, &mutator); + // Always add a base gc ref for all index spaces + result->add_base_gc_ref(REMOTE_DID_REF, &mutator); result->register_with_runtime(&mutator); if (parent != NULL) - parent->add_child(result); - } + { +#ifdef DEBUG_LEGION + assert(!add_root_reference); +#endif + // Always add a valid reference from the parent + result->add_nested_valid_ref(parent->did, &mutator); + // Check to see if the parent is still tree valid + if (!parent->add_child(result)) + { + result->invalidate_tree(); + // If the parent is tree invalid then remove the reference + result->remove_nested_valid_ref(parent->did, &mutator); + } + } + else + { + if (result->is_owner()) + { +#ifdef DEBUG_LEGION + assert(!add_root_reference); +#endif + result->add_base_valid_ref(APPLICATION_REF, &mutator); + } + else if (add_root_reference) + result->add_base_valid_ref(REMOTE_DID_REF, &mutator); + } + } if (local_initialized.exists()) { if (!local_applied.empty()) @@ -3644,13 +3665,26 @@ namespace Legion { // Otherwise the valid ref comes from parent partition if (!result->is_owner()) result->add_base_gc_ref(REMOTE_DID_REF, &mutator); - else if (parent == NULL) - result->add_base_valid_ref(APPLICATION_REF, &mutator); - else - result->add_nested_valid_ref(parent->did, &mutator); result->register_with_runtime(&mutator); if (parent != NULL) - parent->add_child(result); + { + // Always add a valid reference from the parent + result->add_nested_valid_ref(parent->did, &mutator); + // Check to see if the parent is still tree valid + if (!parent->add_child(result)) + { + result->invalidate_tree(); + // If the parent is tree invalid then remove the reference + result->remove_nested_valid_ref(parent->did, &mutator); + } + } + else + { +#ifdef DEBUG_LEGION + 
assert(result->is_owner()); +#endif + result->add_base_valid_ref(APPLICATION_REF, &mutator); + } } if (local_initialized.exists()) { @@ -7918,10 +7952,11 @@ namespace Legion { send_references((parent != NULL) ? 1 : 0), realm_index_space_set(Runtime::create_rt_user_event()), tight_index_space_set(Runtime::create_rt_user_event()), - tight_index_space(false), tree_valid(is_owner()) + tight_index_space(false), tree_valid(true), #ifdef DEBUG_LEGION - , tree_active(true) + tree_active(true), #endif + root_valid(parent == NULL) //-------------------------------------------------------------------------- { #ifdef DEBUG_LEGION @@ -8009,10 +8044,13 @@ namespace Legion { { if (is_owner()) { - AutoLock n_lock(node_lock); - // First time we become invalid then the tree is no longer valid - // Any later valid states are just for expression references - tree_valid = false; + if (parent == NULL) + { + // If we're a root index space node and this is the first time + // we have become invalid then the tree is no longer valid + AutoLock n_lock(node_lock); + tree_valid = false; + } } else send_remote_valid_decrement(owner_space, mutator, @@ -8673,11 +8711,12 @@ namespace Legion { // At this point we are the owner #ifdef DEBUG_LEGION assert(is_owner()); + assert(tree_active); #endif bool pack_space = false; bool still_valid = false; bool has_reference = false; - bool add_remote_reference = false; + bool add_root_reference = false; // Do our check to see if we're still valid { AutoLock n_lock(node_lock); @@ -8711,13 +8750,14 @@ namespace Legion { if (tree_valid && ((parent == NULL) || (send_references > 0))) { still_valid = true; - add_remote_reference = true; // Grab a reference on the parent to keep it from being deleted if (parent != NULL) { send_references++; has_reference = true; } + else if (root_valid) + add_root_reference = true; } else if (above) { @@ -8729,13 +8769,6 @@ namespace Legion { send_precondition = finder->second; return false; } - else if (tree_valid || 
(send_references > 0)) - { - // Technically this invalid, but we still need to send a - // remote reference because we haven't issued the invalidates - // yet so we need one to be there when it arrives - add_remote_reference = true; - } // Record this as an effect for when the node is no longer valid #ifdef DEBUG_LEGION assert(send_effects.find(target) == send_effects.end()); @@ -8777,7 +8810,7 @@ namespace Legion { } // Record that we're going to send this node nodes_to_send.emplace_back(SendNodeRecord(this, still_valid, - add_remote_reference, pack_space, has_reference)); + add_root_reference, pack_space, has_reference)); return still_valid; } @@ -8808,7 +8841,7 @@ namespace Legion { rez.serialize(expr_id); rez.serialize(initialized); rez.serialize(depth); - rez.serialize(record.add_remote_reference); + rez.serialize(record.add_root_reference); if (record.pack_space) pack_index_space(rez, true/*include size*/); else @@ -8836,18 +8869,61 @@ namespace Legion { } //-------------------------------------------------------------------------- - void IndexSpaceNode::remove_send_reference(void) + void IndexSpaceNode::invalidate_tree(void) //-------------------------------------------------------------------------- { +#ifdef DEBUG_LEGION + assert(parent != NULL); +#endif bool remove_reference; { AutoLock n_lock(node_lock); +#ifdef DEBUG_LEGION + assert(tree_valid); +#endif + tree_valid = false; remove_reference = (--send_references == 0); } if (remove_reference && parent->remove_nested_resource_ref(did)) delete parent; } + //-------------------------------------------------------------------------- + void IndexSpaceNode::InvalidateRootFunctor::apply(AddressSpaceID target) + //-------------------------------------------------------------------------- + { + if (target == source) + return; + std::map::const_iterator finder = + effects.find(target); +#ifdef DEBUG_LEGION + assert(finder != effects.end()); +#endif + node->send_remote_valid_decrement(target, &mutator, 
finder->second); + } + + //-------------------------------------------------------------------------- + void IndexSpaceNode::invalidate_root(AddressSpaceID source, + std::set &applied) + //-------------------------------------------------------------------------- + { +#ifdef DEBUG_LEGION + assert(is_owner()); +#endif + AutoLock n_lock(node_lock); +#ifdef DEBUG_LEGION + assert(root_valid); +#endif + root_valid = false; + if (has_remote_instances()) + { + WrapperReferenceMutator mutator(applied); + InvalidateRootFunctor functor(source, this, mutator, + runtime, send_effects); + map_over_remote_instances(functor); + } + } + //-------------------------------------------------------------------------- /*static*/ void IndexSpaceNode::handle_node_creation( RegionTreeForest *context, Deserializer &derez, AddressSpaceID source) @@ -8870,8 +8946,8 @@ namespace Legion { derez.deserialize(initialized); unsigned depth; derez.deserialize(depth); - bool is_remote_valid; - derez.deserialize(is_remote_valid); + bool add_root_reference; + derez.deserialize(add_root_reference); size_t index_space_size; derez.deserialize(index_space_size); const void *index_space_ptr = @@ -8883,7 +8959,7 @@ namespace Legion { true/*can fail*/, true/*local only*/); IndexSpaceNode *node = context->create_node(handle, index_space_ptr, false/*is domain*/, parent_node, color, did, initialized, - ready_event, expr_id, NULL/*applied*/, is_remote_valid, depth); + ready_event, expr_id, NULL/*applied*/, add_root_reference, depth); #ifdef DEBUG_LEGION assert(node != NULL); #endif @@ -9476,7 +9552,7 @@ namespace Legion { total_children(color_sp->get_volume()), max_linearized_color(color_sp->get_max_linearized_color()), partition_ready(part_ready), partial_pending(partial), disjoint(dis), - has_complete(comp >= 0), complete(comp != 0), tree_valid(is_owner()), + has_complete(comp >= 0), complete(comp != 0), tree_valid(true), send_count(0) //-------------------------------------------------------------------------- 
{ @@ -9632,13 +9708,6 @@ namespace Legion { { if (is_owner()) { - { - AutoLock n_lock(node_lock); -#ifdef DEBUG_LEGION - assert(tree_valid); -#endif - tree_valid = false; - } // Remove gc references from our remote nodes if (has_remote_instances()) { @@ -9675,12 +9744,25 @@ namespace Legion { // We still hold resource references to the node so we don't need to // worry about the child nodes being deleted parent->remove_child(color); - for (std::map::const_iterator it = - color_map.begin(); it != color_map.end(); it++) + std::vector to_invalidate; { - it->second->remove_send_reference(); - if (it->second->is_owner()) - it->second->remove_nested_valid_ref(did, mutator); + AutoLock n_lock(node_lock); +#ifdef DEBUG_LEGION + assert(tree_valid); +#endif + tree_valid = false; + to_invalidate.reserve(color_map.size()); + for (std::map::const_iterator it = + color_map.begin(); it != color_map.end(); it++) + to_invalidate.push_back(it->second); + } + for (std::vector::const_iterator it = + to_invalidate.begin(); it != to_invalidate.end(); it++) + { + (*it)->invalidate_tree(); + // Remove the nested valid reference on this index space node + if ((*it)->remove_nested_valid_ref(did, mutator)) + assert(false); // still holding resource ref so should never be hit } if (!partition_trackers.empty()) { @@ -10023,14 +10105,16 @@ namespace Legion { } //-------------------------------------------------------------------------- - void IndexPartNode::add_child(IndexSpaceNode *child) + bool IndexPartNode::add_child(IndexSpaceNode *child) //-------------------------------------------------------------------------- { // This child should live as long as we are alive child->add_nested_resource_ref(did); RtUserEvent to_trigger; + bool result; { AutoLock n_lock(node_lock); + result = tree_valid; #ifdef DEBUG_LEGION assert(color_map.find(child->color) == color_map.end()); #endif @@ -10038,11 +10122,12 @@ namespace Legion { std::map::iterator finder = pending_child_map.find(child->color); if 
(finder == pending_child_map.end()) - return; + return result; to_trigger = finder->second; pending_child_map.erase(finder); } Runtime::trigger_event(to_trigger); + return result; } //-------------------------------------------------------------------------- diff --git a/runtime/legion/region_tree.h b/runtime/legion/region_tree.h index 261ae001e7..658ba6676d 100644 --- a/runtime/legion/region_tree.h +++ b/runtime/legion/region_tree.h @@ -207,7 +207,7 @@ namespace Legion { std::set &safe_events); void compute_partition_disjointness(IndexPartition handle, RtUserEvent ready_event); - void destroy_index_space(IndexSpace handle, + void destroy_index_space(IndexSpace handle, AddressSpaceID source, std::set &preconditions); void destroy_index_partition(IndexPartition handle, std::set &preconditions); @@ -668,7 +668,7 @@ namespace Legion { ApEvent is_ready = ApEvent::NO_AP_EVENT, IndexSpaceExprID expr_id = 0, std::set *applied = NULL, - bool add_remote_reference = true, + bool add_root_reference = false, unsigned depth = UINT_MAX); IndexSpaceNode* create_node(IndexSpace is, const void *realm_is, IndexPartNode *par, LegionColor color, @@ -1861,12 +1861,12 @@ namespace Legion { public: SendNodeRecord(IndexTreeNode *n, bool valid = false, bool add = false, bool pack = false, bool has_ref = false) - : node(n), still_valid(valid), add_remote_reference(add), + : node(n), still_valid(valid), add_root_reference(add), pack_space(pack), has_reference(has_ref) { } public: IndexTreeNode *node; bool still_valid; - bool add_remote_reference; + bool add_root_reference; bool pack_space; bool has_reference; }; @@ -2003,6 +2003,21 @@ namespace Legion { ReferenceMutator *const mutator; std::map &send_effects; }; + class InvalidateRootFunctor { + public: + InvalidateRootFunctor(AddressSpaceID src, IndexSpaceNode *n, + ReferenceMutator &m, Runtime *rt, + const std::map &e) + : source(src), node(n), runtime(rt), mutator(m), effects(e) { } + public: + void apply(AddressSpaceID target); + public: 
+ const AddressSpaceID source; + IndexSpaceNode *const node; + Runtime *const runtime; + ReferenceMutator &mutator; + const std::map &effects; + }; public: IndexSpaceNode(RegionTreeForest *ctx, IndexSpace handle, IndexPartNode *parent, LegionColor color, @@ -2065,7 +2080,9 @@ namespace Legion { const bool above = false); virtual void pack_node(Serializer &rez, AddressSpaceID target, const SendNodeRecord &record); - void remove_send_reference(void); + void invalidate_tree(void); + void invalidate_root(AddressSpaceID source, + std::set &applied); static void handle_node_creation(RegionTreeForest *context, Deserializer &derez, AddressSpaceID source); @@ -2273,6 +2290,9 @@ namespace Legion { // Keep track of whether we are active, should only happen once bool tree_active; #endif + // Keep track of whether we've had our application + // reference removed if this is a root node + bool root_valid; }; /** @@ -2960,7 +2980,7 @@ namespace Legion { public: bool has_color(const LegionColor c); IndexSpaceNode* get_child(const LegionColor c, RtEvent *defer = NULL); - void add_child(IndexSpaceNode *child); + bool add_child(IndexSpaceNode *child); void add_tracker(PartitionTracker *tracker); size_t get_num_children(void) const; void get_subspace_preconditions(std::set &preconditions); diff --git a/runtime/legion/runtime.cc b/runtime/legion/runtime.cc index 18ba843816..b38d53bd92 100644 --- a/runtime/legion/runtime.cc +++ b/runtime/legion/runtime.cc @@ -8303,7 +8303,8 @@ namespace Legion { } case INDEX_SPACE_DESTRUCTION_MESSAGE: { - runtime->handle_index_space_destruction(derez); + runtime->handle_index_space_destruction(derez, + remote_address_space); break; } case INDEX_PARTITION_DESTRUCTION_MESSAGE: @@ -18061,7 +18062,8 @@ namespace Legion { } //-------------------------------------------------------------------------- - void Runtime::handle_index_space_destruction(Deserializer &derez) + void Runtime::handle_index_space_destruction(Deserializer &derez, + AddressSpaceID source) 
//-------------------------------------------------------------------------- { DerezCheck z(derez); @@ -18073,7 +18075,7 @@ namespace Legion { assert(done.exists()); #endif std::set applied; - forest->destroy_index_space(handle, applied); + forest->destroy_index_space(handle, source, applied); if (!applied.empty()) Runtime::trigger_event(done, Runtime::merge_events(applied)); else @@ -20369,7 +20371,7 @@ namespace Legion { std::set applied; for (std::map,IndexSpace>::const_iterator it = index_slice_spaces.begin(); it != index_slice_spaces.end(); it++) - forest->destroy_index_space(it->second, applied); + forest->destroy_index_space(it->second, address_space, applied); // If there are still any layout constraints that the application // failed to remove its references to then we can remove the reference // for them and make sure it's effects propagate diff --git a/runtime/legion/runtime.h b/runtime/legion/runtime.h index e9097c7f3f..f2ca2f1135 100644 --- a/runtime/legion/runtime.h +++ b/runtime/legion/runtime.h @@ -2520,7 +2520,8 @@ namespace Legion { AddressSpaceID source); void handle_top_level_region_return(Deserializer &derez, AddressSpaceID source); - void handle_index_space_destruction(Deserializer &derez); + void handle_index_space_destruction(Deserializer &derez, + AddressSpaceID source); void handle_index_partition_destruction(Deserializer &derez); void handle_field_space_destruction(Deserializer &derez); void handle_logical_region_destruction(Deserializer &derez); From e40015903234d486f8f9f4a3a198524cbe7aba62 Mon Sep 17 00:00:00 2001 From: Sean Treichler Date: Fri, 10 Dec 2021 08:29:57 -0800 Subject: [PATCH 34/36] realm: make sure rma put completions are pushed --- runtime/realm/gasnetex/gasnetex_internal.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/realm/gasnetex/gasnetex_internal.cc b/runtime/realm/gasnetex/gasnetex_internal.cc index 85dc1d817b..f1babf430c 100644 --- a/runtime/realm/gasnetex/gasnetex_internal.cc +++ 
b/runtime/realm/gasnetex/gasnetex_internal.cc @@ -1711,7 +1711,7 @@ namespace Realm { // no more completion replies, but do pushing if there are // ready packets ncomps = 0; - do_push = has_ready_packets; + do_push = has_ready_packets || !put_head.load(); } } } else { From cf4f5e7ddc071c78875f3b9225f533c6c83fe5d0 Mon Sep 17 00:00:00 2001 From: Mike Bauer Date: Fri, 17 Dec 2021 15:53:27 -0800 Subject: [PATCH 35/36] legion: cleanup of distributed expressions --- runtime/legion/garbage_collection.cc | 75 ---------------------------- runtime/legion/garbage_collection.h | 21 +------- runtime/legion/legion_analysis.cc | 6 +-- runtime/legion/legion_context.cc | 2 +- runtime/legion/legion_instances.cc | 8 +-- runtime/legion/legion_ops.h | 3 +- runtime/legion/legion_tasks.cc | 2 +- runtime/legion/region_tree.cc | 40 +++++++-------- runtime/legion/region_tree.h | 2 +- runtime/legion/runtime.cc | 8 +-- 10 files changed, 36 insertions(+), 131 deletions(-) diff --git a/runtime/legion/garbage_collection.cc b/runtime/legion/garbage_collection.cc index d3b185827b..9c060ef24b 100644 --- a/runtime/legion/garbage_collection.cc +++ b/runtime/legion/garbage_collection.cc @@ -31,7 +31,6 @@ namespace Legion { //-------------------------------------------------------------------------- LocalReferenceMutator::LocalReferenceMutator( const LocalReferenceMutator &rhs) - : waiter(rhs.waiter) //-------------------------------------------------------------------------- { // should never be called @@ -44,9 +43,6 @@ namespace Legion { { if (!mutation_effects.empty()) { -#ifdef DEBUG_LEGION - assert(waiter); -#endif RtEvent wait_on = Runtime::merge_events(mutation_effects); wait_on.wait(); } @@ -73,9 +69,6 @@ namespace Legion { RtEvent LocalReferenceMutator::get_done_event(void) //-------------------------------------------------------------------------- { -#ifdef DEBUG_LEGION - assert(!waiter); -#endif if (mutation_effects.empty()) return RtEvent::NO_RT_EVENT; RtEvent result = 
Runtime::merge_events(mutation_effects); @@ -1821,23 +1814,6 @@ namespace Legion { #ifdef DEBUG_LEGION assert(count != 0); assert(registered_with_runtime); -#endif -#if 0 - // If there is no mutator or it is a non-waiting mutator then we - // can buffer this up in the implicit reference tracker and send it - // at the end of the runtime call or meta-task - if ((mutator == NULL) || !mutator->is_waiting_mutator()) - { - if (implicit_reference_tracker == NULL) - implicit_reference_tracker = new ImplicitReferenceTracker; - const RtEvent send_event = - implicit_reference_tracker->record_valid_increment(did, target, - precondition, count); - if (mutator != NULL) - mutator->record_reference_mutation_effect( - implicit_reference_tracker->get_effects_event()); - return send_event; - } #endif RtUserEvent done_event; if (mutator != NULL) @@ -1875,23 +1851,6 @@ namespace Legion { assert(count != 0); assert(registered_with_runtime); #endif -#if 0 - // If there is no mutator or it is a non-waiting mutator then we - // can buffer this up in the implicit reference tracker and send it - // at the end of the runtime call or meta-task - if ((mutator == NULL) || !mutator->is_waiting_mutator()) - { - if (implicit_reference_tracker == NULL) - implicit_reference_tracker = new ImplicitReferenceTracker; - const RtEvent send_event = - implicit_reference_tracker->record_valid_decrement(did, target, - precondition, count); - if (mutator != NULL) - mutator->record_reference_mutation_effect( - implicit_reference_tracker->get_effects_event()); - return send_event; - } -#endif RtUserEvent done_event; if (mutator != NULL) { @@ -1927,23 +1886,6 @@ namespace Legion { #ifdef DEBUG_LEGION assert(count != 0); assert(registered_with_runtime); -#endif -#if 0 - // If there is no mutator or it is a non-waiting mutator then we - // can buffer this up in the implicit reference tracker and send it - // at the end of the runtime call or meta-task - if ((mutator == NULL) || !mutator->is_waiting_mutator()) - { - 
if (implicit_reference_tracker == NULL) - implicit_reference_tracker = new ImplicitReferenceTracker; - const RtEvent send_event = - implicit_reference_tracker->record_gc_increment(did, target, - precondition, count); - if (mutator != NULL) - mutator->record_reference_mutation_effect( - implicit_reference_tracker->get_effects_event()); - return send_event; - } #endif RtUserEvent done_event; if (mutator != NULL) @@ -1980,23 +1922,6 @@ namespace Legion { #ifdef DEBUG_LEGION assert(count != 0); assert(registered_with_runtime); -#endif -#if 0 - // If there is no mutator or it is a non-waiting mutator then we - // can buffer this up in the implicit reference tracker and send it - // at the end of the runtime call or meta-task - if ((mutator == NULL) || !mutator->is_waiting_mutator()) - { - if (implicit_reference_tracker == NULL) - implicit_reference_tracker = new ImplicitReferenceTracker; - const RtEvent send_event = - implicit_reference_tracker->record_gc_increment(did, target, - precondition, count); - if (mutator != NULL) - mutator->record_reference_mutation_effect( - implicit_reference_tracker->get_effects_event()); - return send_event; - } #endif RtUserEvent done_event; if (mutator != NULL) diff --git a/runtime/legion/garbage_collection.h b/runtime/legion/garbage_collection.h index ae2c09445d..7ac4fa1e79 100644 --- a/runtime/legion/garbage_collection.h +++ b/runtime/legion/garbage_collection.h @@ -182,7 +182,6 @@ namespace Legion { */ class ReferenceMutator { public: - virtual bool is_waiting_mutator(void) const = 0; virtual void record_reference_mutation_effect(RtEvent event) = 0; }; @@ -194,19 +193,17 @@ namespace Legion { */ class LocalReferenceMutator : public ReferenceMutator { public: - LocalReferenceMutator(bool wait) : waiter(wait) { } + LocalReferenceMutator(void) { } LocalReferenceMutator(const LocalReferenceMutator &rhs); ~LocalReferenceMutator(void); public: LocalReferenceMutator& operator=(const LocalReferenceMutator &rhs); public: - virtual bool 
is_waiting_mutator(void) const { return waiter; } virtual void record_reference_mutation_effect(RtEvent event); public: RtEvent get_done_event(void); private: std::set mutation_effects; - const bool waiter; }; /** @@ -222,7 +219,6 @@ namespace Legion { public: WrapperReferenceMutator& operator=(const WrapperReferenceMutator &rhs); public: - virtual bool is_waiting_mutator(void) const { return false; } virtual void record_reference_mutation_effect(RtEvent event); private: std::set &mutation_effects; @@ -245,21 +241,6 @@ namespace Legion { public: inline void record_live_expression(IndexSpaceExpression *expr) { live_expressions.emplace_back(expr); } -#if 0 - public: - RtEvent record_valid_increment(DistributedID did, - AddressSpaceID target, - unsigned count); - RtEvent record_valid_decrement(DistributedID did, - AddressSpaceID target, - unsigned count); - RtEvent record_gc_increment(DistributedID did, - AddressSpaceID target, - unsigned count); - RtEvent record_gc_decrement(DistributedID did, - AddressSpaceID target, - unsigned count); -#endif private: std::vector live_expressions; }; diff --git a/runtime/legion/legion_analysis.cc b/runtime/legion/legion_analysis.cc index ac0b07211b..b5c1a82c3a 100644 --- a/runtime/legion/legion_analysis.cc +++ b/runtime/legion/legion_analysis.cc @@ -10081,7 +10081,7 @@ namespace Legion { if (is_logical_owner() || initial_refinement) { // We're the owner so we can do the merge - LocalReferenceMutator mutator(false/*waiter*/); + LocalReferenceMutator mutator; for (FieldMaskSet::const_iterator it = new_views.begin(); it != new_views.end(); it++) if (valid_instances.insert(it->first, it->second)) @@ -10610,7 +10610,7 @@ namespace Legion { transition_event = RtUserEvent::NO_RT_USER_EVENT; } eq_state = MAPPING_STATE; - LocalReferenceMutator mutator(false/*waiter*/); + LocalReferenceMutator mutator; // Add references to all the views that we've loaded for (FieldMaskSet::const_iterator it = valid_instances.begin(); it != 
valid_instances.end(); it++) @@ -13885,7 +13885,7 @@ namespace Legion { const RemoteRefTaskArgs *rargs = (const RemoteRefTaskArgs*)args; if (rargs->done_event.exists()) { - LocalReferenceMutator mutator(false/*waiter*/); + LocalReferenceMutator mutator; if (rargs->add_references) { for (std::map::const_iterator it = diff --git a/runtime/legion/legion_context.cc b/runtime/legion/legion_context.cc index b1f45f135d..5f45e2b67c 100644 --- a/runtime/legion/legion_context.cc +++ b/runtime/legion/legion_context.cc @@ -5786,7 +5786,7 @@ namespace Legion { const DistributedID did = runtime->get_available_distributed_id(); FutureMapImpl *impl = new FutureMapImpl(this, runtime, did, runtime->address_space, RtEvent::NO_RT_EVENT); - LocalReferenceMutator mutator(true/*waiter*/); + LocalReferenceMutator mutator; for (std::map::const_iterator it = data.begin(); it != data.end(); it++) { diff --git a/runtime/legion/legion_instances.cc b/runtime/legion/legion_instances.cc index f2dffa957d..a6bf8ee726 100644 --- a/runtime/legion/legion_instances.cc +++ b/runtime/legion/legion_instances.cc @@ -3093,7 +3093,7 @@ namespace Legion { { RtUserEvent to_trigger; derez.deserialize(to_trigger); - LocalReferenceMutator mutator(false/*waiter*/);; + LocalReferenceMutator mutator; manager->activate_collective(&mutator); Runtime::trigger_event(to_trigger, mutator.get_done_event()); break; @@ -3102,7 +3102,7 @@ namespace Legion { { RtUserEvent to_trigger; derez.deserialize(to_trigger); - LocalReferenceMutator mutator(false/*waiter*/); + LocalReferenceMutator mutator; manager->deactivate_collective(&mutator); Runtime::trigger_event(to_trigger, mutator.get_done_event()); break; @@ -3111,7 +3111,7 @@ namespace Legion { { RtUserEvent to_trigger; derez.deserialize(to_trigger); - LocalReferenceMutator mutator(false/*waiter*/); + LocalReferenceMutator mutator; manager->validate_collective(&mutator); Runtime::trigger_event(to_trigger, mutator.get_done_event()); break; @@ -3120,7 +3120,7 @@ namespace Legion 
{ { RtUserEvent to_trigger; derez.deserialize(to_trigger); - LocalReferenceMutator mutator(false/*waiter*/); + LocalReferenceMutator mutator; manager->invalidate_collective(&mutator); Runtime::trigger_event(to_trigger, mutator.get_done_event()); break; diff --git a/runtime/legion/legion_ops.h b/runtime/legion/legion_ops.h index 23fc1e8ea2..4f61311fc0 100644 --- a/runtime/legion/legion_ops.h +++ b/runtime/legion/legion_ops.h @@ -348,7 +348,6 @@ namespace Legion { const std::vector *dependences = NULL); public: // Inherited from ReferenceMutator - virtual bool is_waiting_mutator(void) const { return false; } virtual void record_reference_mutation_effect(RtEvent event); public: RtEvent execute_prepipeline_stage(GenerationID gen, @@ -618,7 +617,7 @@ namespace Legion { protected: static inline void add_launch_space_reference(IndexSpaceNode *node) { - LocalReferenceMutator mutator(true/*waiter*/); + LocalReferenceMutator mutator; node->add_base_valid_ref(CONTEXT_REF, &mutator); } static inline bool remove_launch_space_reference(IndexSpaceNode *node) diff --git a/runtime/legion/legion_tasks.cc b/runtime/legion/legion_tasks.cc index 7d7b4f3345..e7b3d69192 100644 --- a/runtime/legion/legion_tasks.cc +++ b/runtime/legion/legion_tasks.cc @@ -9473,7 +9473,7 @@ namespace Legion { #ifdef DEBUG_LEGION assert(finder != future_handles->handles.end()); #endif - LocalReferenceMutator mutator(false/*not waiting*/); + LocalReferenceMutator mutator; FutureImpl *impl = runtime->find_or_create_future(finder->second, parent_ctx->get_context_uid(), &mutator); if (functor != NULL) diff --git a/runtime/legion/region_tree.cc b/runtime/legion/region_tree.cc index d6748f57d4..2fbc07543d 100644 --- a/runtime/legion/region_tree.cc +++ b/runtime/legion/region_tree.cc @@ -5665,7 +5665,7 @@ namespace Legion { // Add the live reference if (mutator == NULL) { - LocalReferenceMutator local_mutator(true/*waiter*/); + LocalReferenceMutator local_mutator; 
result->add_base_expression_reference(LIVE_EXPR_REF, &local_mutator); } else @@ -5705,7 +5705,7 @@ namespace Legion { } if (expressions.empty()) return *(exprs.begin()); - LocalReferenceMutator local_mutator(true/*waiter*/); + LocalReferenceMutator local_mutator; if (expressions.size() == 1) { IndexSpaceExpression *result = expressions.back(); @@ -5984,7 +5984,7 @@ namespace Legion { // Add the live reference if (mutator == NULL) { - LocalReferenceMutator local_mutator(true/*waiter*/); + LocalReferenceMutator local_mutator; result->add_base_expression_reference(LIVE_EXPR_REF, &local_mutator); } else @@ -6028,7 +6028,7 @@ namespace Legion { // remove duplicates std::vector::iterator last = std::unique(expressions.begin(), expressions.end()); - LocalReferenceMutator local_mutator(true/*waiter*/); + LocalReferenceMutator local_mutator; if (last != expressions.end()) { expressions.erase(last, expressions.end()); @@ -6383,7 +6383,7 @@ namespace Legion { } if (mutator == NULL) { - LocalReferenceMutator local_mutator(true/*waiter*/); + LocalReferenceMutator local_mutator; result->add_base_expression_reference(LIVE_EXPR_REF, &local_mutator); } else @@ -6607,7 +6607,7 @@ namespace Legion { if (pending.is_index_space) { IndexSpaceNode *node = get_node(pending.handle); - LocalReferenceMutator mutator(false/*waiter*/); + LocalReferenceMutator mutator; node->add_base_expression_reference(LIVE_EXPR_REF, &mutator); const RtEvent added = mutator.get_done_event(); // Special case here: if the source was the owner and we didn't @@ -6643,7 +6643,7 @@ namespace Legion { #else IndexSpaceOperation *op = static_cast(result); #endif - LocalReferenceMutator mutator(false/*waiter*/); + LocalReferenceMutator mutator; result->add_base_expression_reference(LIVE_EXPR_REF, &mutator); const RtEvent added = mutator.get_done_event(); // Special case here: if the source was the owner and we didn't @@ -6914,7 +6914,7 @@ namespace Legion { #endif // Make this valid and then send the removal of the // 
remote did expression - LocalReferenceMutator mutator(false/*waiter*/); + LocalReferenceMutator mutator; op->add_base_expression_reference(LIVE_EXPR_REF, &mutator); // Always need to send this reference removal back immediately // in order to avoid reference counting deadlock @@ -6934,7 +6934,7 @@ namespace Legion { IndexSpace handle; derez.deserialize(handle); IndexSpaceNode *node = forest->get_node(handle); - LocalReferenceMutator mutator(false/*waiter*/); + LocalReferenceMutator mutator; node->add_base_expression_reference(LIVE_EXPR_REF, &mutator); const RtEvent added = mutator.get_done_event(); // Special case here: if the source was the owner and we didn't @@ -6966,7 +6966,7 @@ namespace Legion { #else IndexSpaceOperation *op = static_cast(result); #endif - LocalReferenceMutator mutator(false/*waiter*/); + LocalReferenceMutator mutator; result->add_base_expression_reference(LIVE_EXPR_REF, &mutator); const RtEvent added = mutator.get_done_event(); // Special case here: if the source was the owner and we didn't @@ -7010,7 +7010,7 @@ namespace Legion { #endif // Make this valid and then send the removal of the // remote did expression - LocalReferenceMutator mutator(false/*waiter*/); + LocalReferenceMutator mutator; op->add_base_expression_reference(LIVE_EXPR_REF, &mutator); // Always need to send this reference removal back immediately // in order to avoid reference counting deadlock @@ -7033,7 +7033,7 @@ namespace Legion { pending.source = source; return node; } - LocalReferenceMutator mutator(false/*waiter*/); + LocalReferenceMutator mutator; node->add_base_expression_reference(LIVE_EXPR_REF, &mutator); const RtEvent added = mutator.get_done_event(); // Special case here: if the source was the owner and we didn't @@ -7068,7 +7068,7 @@ namespace Legion { #else IndexSpaceOperation *op = static_cast(result); #endif - LocalReferenceMutator mutator(false/*waiter*/); + LocalReferenceMutator mutator; result->add_base_expression_reference(LIVE_EXPR_REF, &mutator); 
const RtEvent added = mutator.get_done_event(); // Special case here: if the source was the owner and we didn't @@ -7264,7 +7264,7 @@ namespace Legion { { if (mutator == NULL) { - LocalReferenceMutator local_mutator(true/*waiter*/); + LocalReferenceMutator local_mutator; add_base_valid_ref(source, &local_mutator, count); } else @@ -7287,7 +7287,7 @@ namespace Legion { { if (mutator == NULL) { - LocalReferenceMutator local_mutator(true/*waiter*/); + LocalReferenceMutator local_mutator; add_nested_valid_ref(source, &local_mutator, count); } else @@ -8789,7 +8789,7 @@ namespace Legion { // If this is above then we don't care about it if it // is not still valid bool remove_reference = false; - LocalReferenceMutator mutator(true/*waiter*/); + LocalReferenceMutator mutator; if (has_reference) { AutoLock n_lock(node_lock); @@ -8823,7 +8823,7 @@ namespace Legion { assert(record.node == this); #endif bool remove_reference = false; - LocalReferenceMutator mutator(true/*waiter*/); + LocalReferenceMutator mutator; { AutoLock n_lock(node_lock); { @@ -9281,7 +9281,7 @@ namespace Legion { // This could be a performance bug since it will block if we // have to send a reference to a remote node, but that should // never actually happen - LocalReferenceMutator mutator(true/*waiter*/); + LocalReferenceMutator mutator; add_base_gc_ref(REMOTE_DID_REF, &mutator); } @@ -9320,7 +9320,7 @@ namespace Legion { { if (mutator == NULL) { - LocalReferenceMutator local_mutator(true/*waiter*/); + LocalReferenceMutator local_mutator; add_base_valid_ref(source, &local_mutator, count); } else @@ -9343,7 +9343,7 @@ namespace Legion { { if (mutator == NULL) { - LocalReferenceMutator local_mutator(true/*waiter*/); + LocalReferenceMutator local_mutator; add_nested_valid_ref(source, &local_mutator, count); } else diff --git a/runtime/legion/region_tree.h b/runtime/legion/region_tree.h index 658ba6676d..4739bb69a2 100644 --- a/runtime/legion/region_tree.h +++ b/runtime/legion/region_tree.h @@ -1379,7 
+1379,7 @@ namespace Legion { { if (m == NULL) { - LocalReferenceMutator local_mutator(true/*waiting*/); + LocalReferenceMutator local_mutator; expr->add_base_expression_reference(LIVE_EXPR_REF, &local_mutator); } else diff --git a/runtime/legion/runtime.cc b/runtime/legion/runtime.cc index b38d53bd92..f7ad941ae1 100644 --- a/runtime/legion/runtime.cc +++ b/runtime/legion/runtime.cc @@ -5254,7 +5254,7 @@ namespace Legion { bool remove_duplicate = false; if (success.load()) { - LocalReferenceMutator local_mutator(true/*waiter*/); + LocalReferenceMutator local_mutator; // Add our local reference manager->add_base_valid_ref(NEVER_GC_REF, &local_mutator); const RtEvent reference_effects = local_mutator.get_done_event(); @@ -5969,7 +5969,7 @@ namespace Legion { // and then remove the remote DID if (acquire) { - LocalReferenceMutator local_mutator(false/*waiter*/); + LocalReferenceMutator local_mutator; manager->add_base_valid_ref(MAPPING_ACQUIRE_REF, &local_mutator); const RtEvent reference_effects = local_mutator.get_done_event(); manager->send_remote_valid_decrement(source, NULL, @@ -6103,7 +6103,7 @@ namespace Legion { // and then remove the remote DID if (acquire) { - LocalReferenceMutator local_mutator(true/*waiter*/); + LocalReferenceMutator local_mutator; manager->add_base_valid_ref(MAPPING_ACQUIRE_REF, &local_mutator); const RtEvent reference_effects = local_mutator.get_done_event(); manager->send_remote_valid_decrement(source, NULL, @@ -6323,7 +6323,7 @@ namespace Legion { (*target)[index] = true; PhysicalManager *manager; derez.deserialize(manager); - LocalReferenceMutator local_mutator(false/*waiter*/); + LocalReferenceMutator local_mutator; manager->add_base_valid_ref(MAPPING_ACQUIRE_REF, &local_mutator); const RtEvent reference_effects = local_mutator.get_done_event(); manager->send_remote_valid_decrement(source, NULL, reference_effects); From 088791e996c0e1d6d8d5f22a865730c7368020ed Mon Sep 17 00:00:00 2001 From: Sean Treichler Date: Fri, 17 Dec 2021 
16:32:28 -0800 Subject: [PATCH 36/36] beginning of CHANGES.txt for 21.12 release --- CHANGES.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/CHANGES.txt b/CHANGES.txt index 19f3855721..05fd8d2608 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,6 +1,19 @@ This file lists the major changes as they appear in the stable branch. No attempt is made to keep this list accurate for the master branch. +Version 21.12.0 (December ??, 2021) + * Realm + - Performance improvements for multi-dimensional copies, especially + inter-process transfers + - Support for loading CUDA driver (if present) at runtime instead of + link time, allowing same binary to be used on systems with and without + CUDA-capable GPUs (enabled with -DLegion_CUDA_DYNAMIC_LOAD=ON in + cmake build) + * Build + - CMake allows control of max nodes (-DLegion_MAX_NUM_NODES=...) and + max processors/node (-DLegion_MAX_NUM_PROCS=...) supported by + Legion build + Version 21.09.0 (September 28, 2021) * Realm - Numerous bug fixes in the `gasnetex` network layer