Skip to content

Commit

Permalink
Include more countries, especially some big ones with force include
Browse files Browse the repository at this point in the history
Relax the location_min_seq_days as many important countries have quite some delay
but are still informative/important
  • Loading branch information
corneliusroemer committed Sep 27, 2023
1 parent 66b6d7f commit db423bb
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 9 deletions.
8 changes: 5 additions & 3 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,20 @@ prepare_data:
nextstrain_clades:
global:
included_days: 150
location_min_seq: 100
location_min_seq_days: 30
location_min_seq: 200
location_min_seq_days: 70
excluded_locations: "defaults/global_excluded_locations.txt"
included_locations: "defaults/global_included_locations.txt"
prune_seq_days: 12
clade_min_seq: 5000
clade_min_seq_days: 150
pango_lineages:
global:
included_days: 150
location_min_seq: 300
location_min_seq_days: 30
location_min_seq_days: 100
excluded_locations: "defaults/global_excluded_locations.txt"
included_locations: "defaults/global_included_locations.txt"
prune_seq_days: 12
clade_min_seq: 1
clade_min_seq_days: 150
Expand Down
5 changes: 5 additions & 0 deletions defaults/global_included_locations.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
India
South Africa
Brazil
Malaysia
Thailand
23 changes: 17 additions & 6 deletions scripts/prepare-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,23 +54,25 @@ def positive_int(value):
"This is useful to exclude sequence counts for recent days that are overly enriched for variants.")
parser.add_argument("--location-min-seq", type=positive_int, default=1,
help="The mininum number of sequences a location must have within the "
"days-min-seq to be included in analysis.\n"
"location-min-seq-days to be included in analysis.\n"
"(default: %(default)s)")
parser.add_argument("--location-min-seq-days", type=positive_int,
help="The number of days (counting back from the cutoff date) to use as the date range "
"for counting the number of sequences per location to determine if a location is included in analysis.\n"
"If not provided, will count sequences from all dates included in analysis date range.")
parser.add_argument("--excluded-locations",
help="File with a list locations to exclude from analysis.")
help="File with a list locations to always exclude from analysis.")
parser.add_argument("--included-locations",
help="File with a list locations to always include in analysis.")
parser.add_argument("--clade-min-seq", type=positive_int,
help="The minimum number of sequences a clades must have to be included as it's own variant.\n"
help="The minimum number of sequences a clades must have to be included as its own variant.\n"
"All clades with less than the minimum will be collapsed as 'other'.")
parser.add_argument("--clade-min-seq-days", type=positive_int,
help="The number fo days (counting back from the cutoff date) to use as the date range "
help="The number of days (counting back from the cutoff date) to use as the date range "
"for counting the number of sequences per clade to determine if a clade is included as its own variant.\n"
"If not provided, will count sequences from all dates included in analysis date range.")
parser.add_argument("--force-include-clades", nargs="*",
help="Clades to force include in the output regardless of sequences counts. " +
help="Clades to force include in the output regardless of sequence counts. " +
"Must be formatted as <clade_name>=<variant_name>")
parser.add_argument("--output-seq-counts", required=True,
help="Path to output TSV file for the prepared variants data.")
Expand Down Expand Up @@ -131,6 +133,7 @@ def positive_int(value):

# Get a set of locations that meet the location_min_seq requirement
locations_with_min_seq = set(seqs_per_location.loc[seqs_per_location['sequences'] >= args.location_min_seq, 'location'])
locations_with_min_tenth_seq = set(seqs_per_location.loc[seqs_per_location['sequences'] >= args.location_min_seq / 10, 'location'])

# Load manually annotated excluded locations if provided
excluded_locations = set()
Expand All @@ -140,8 +143,16 @@ def positive_int(value):

print(f"Excluding the following requested locations: {sorted(excluded_locations)}.")

# Load manually annotated excluded locations if provided
included_locations = set()
if args.included_locations:
with open(args.included_locations, 'r') as f:
included_locations = {line.rstrip() for line in f} & locations_with_min_tenth_seq

print(f"Including the following requested locations: {sorted(included_locations)}.")

# Remove excluded-locations from the set of locations to include in analysis
locations_to_include = locations_with_min_seq - excluded_locations
locations_to_include = locations_with_min_seq - excluded_locations | included_locations
print(f"Locations that will be included: {sorted(locations_to_include)}.")

assert len(locations_to_include) > 0, \
Expand Down
2 changes: 2 additions & 0 deletions workflow/snakemake_rules/prepare_data.smk
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ rule prepare_clade_data:
included_days = lambda wildcards: _get_prepare_data_option(wildcards, 'included_days'),
location_min_seq = lambda wildcards: _get_prepare_data_option(wildcards, 'location_min_seq'),
location_min_seq_days = lambda wildcards: _get_prepare_data_option(wildcards, 'location_min_seq_days'),
included_locations = lambda wildcards: _get_prepare_data_option(wildcards, 'included_locations'),
excluded_locations = lambda wildcards: _get_prepare_data_option(wildcards, 'excluded_locations'),
prune_seq_days = lambda wildcards: _get_prepare_data_option(wildcards, 'prune_seq_days'),
clade_min_seq = lambda wildcards: _get_prepare_data_option(wildcards, 'clade_min_seq'),
Expand All @@ -74,6 +75,7 @@ rule prepare_clade_data:
{params.location_min_seq} \
{params.location_min_seq_days} \
{params.excluded_locations} \
{params.included_locations} \
{params.prune_seq_days} \
{params.clade_min_seq} \
{params.clade_min_seq_days} \
Expand Down

0 comments on commit db423bb

Please sign in to comment.