Skip to content

Commit 528d303

Browse files
awh082834 and xonq
authored
[Basespace_Fetch] Extending grep -E to project ID track (#981)
* addition of grep -E to project_id search and new search params
* making regex more robust
* docs
* strengthening separator check
* condense to elif
* doc clarifications
* update description
* rm unnecessary continue
* separator selection to previous functionality

---------

Co-authored-by: xonq <konkelzach@tuta.io>
1 parent 865e6ba commit 528d303

File tree

2 files changed

+23
-13
lines changed

2 files changed

+23
-13
lines changed

docs/workflows/data_import/basespace_fetch.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,9 @@ _If you already have a command-line environment available_, you can skip ahead t
145145
!!! warning "Sample_Name _and_ Sample_ID"
146146
If the Sample_Name and Sample_ID in the BaseSpace sample sheet are different, set the `basespace_sample_id` input attribute to "`this.basespace_sample_id`".
147147

148+
!!! warning "Nested Samplenames"
149+
Erroneous matches may occur when a samplename is nested within another samplename in the same batch, separated by an underscore or space. This will occur when no lane suffix is present within the BaseSpace dataset. For example, "sample1" will also retrieve "sample1_1"; however, "sample1_L1" will NOT retrieve "sample1_1_L1". Take this into consideration when choosing a naming convention.
150+
148151
/// html | div[class="searchable-table"]
149152

150153
{{ render_tsv_table("docs/assets/tables/all_inputs.tsv", input_table=True, filters={"Workflow": "BaseSpace_Fetch"}, columns=["Terra Task Name", "Variable", "Type", "Description", "Default Value", "Terra Status"], sort_by=[("Terra Status", True), "Terra Task Name", "Variable"]) }}

tasks/utilities/data_import/task_basespace_cli.wdl

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,14 @@ task fetch_bs {
3939
#Grab BaseSpace Run_ID from given BaseSpace Run Name
4040
run_id=$(${bs_command} list run --retry | grep "~{basespace_collection_id}" | awk -F "|" '{ print $3 }' | awk '{$1=$1;print}' )
4141
echo "run_id: ${run_id}"
42+
43+
# NOTE: substring matching will occur when the data table does not append a suffix;
44+
# e.g. "sample1" will also retrieve "sample1_1"; however, "sample1_L1" will NOT retrieve "sample1_1_L1"
45+
# This cannot be resolved without explicitly knowing the suffix prior to parsing; the limitation is noted in the documentation
46+
4247
if [[ ! -z "${run_id}" ]]; then
4348
#Grab BaseSpace Dataset ID from dataset lists within given run
44-
dataset_id_array=($(${bs_command} list dataset --retry --input-run=${run_id} | grep -E "${dataset_name}([_]|$)" | awk -F "|" '{ print $3 }' ))
49+
dataset_id_array=($(${bs_command} list dataset --retry --input-run=${run_id} | grep -E "${dataset_name}(_| )[^_|^ ]* *\|[^\|]*\|[^\|]*\|[^\|]*\|" | awk -F "|" '{ print $3 }' ))
4550
echo "dataset_id: ${dataset_id_array[*]}"
4651
else
4752
#Try Grabbing BaseSpace Dataset ID from project name
@@ -50,7 +55,7 @@ task fetch_bs {
5055
echo "project_id: ${project_id}"
5156
if [[ ! -z "${project_id}" ]]; then
5257
echo "project_id identified via Basespace, now searching for dataset_id within project_id ${project_id}..."
53-
dataset_id_array=($(${bs_command} list dataset --retry --project-id=${project_id} | grep "${dataset_name}" | awk -F "|" '{ print $3 }' ))
58+
dataset_id_array=($(${bs_command} list dataset --retry --project-id=${project_id} | grep -E "${dataset_name}(_| )[^_|^ ]* *\|[^\|]*\|[^\|]*\|[^\|]*\|" | awk -F "|" '{ print $3 }' ))
5459
echo "dataset_id: ${dataset_id_array[*]}"
5560
else
5661
echo "No run or project id found associated with input basespace_collection_id: ~{basespace_collection_id}" >&2
@@ -72,21 +77,23 @@ task fetch_bs {
7277
# setting a new bash variable to use for renaming during concatenation of FASTQs
7378
for elm in ./dataset_${dataset_id}/*.fastq.gz; do
7479
echo "Checking Basespace file: $elm"
75-
if [[ $(echo "$elm" | cut -d '/' -f 3) =~ [-] && $sample_identifier =~ [_] ]]; then
76-
echo "Basespace sample name for $(echo "$elm" | cut -d '/' -f 3) contains dashes, input sample identifier $sample_identifier contains underscores, renaming identifier..."
77-
SAMPLENAME_RENAMED=$(echo $sample_identifier | sed 's|_|-|g' | sed 's|\.|-|g')
80+
filename=$(basename "$elm")
81+
82+
if [[ "$filename" =~ [-] && "$sample_identifier" =~ [_] ]]; then
83+
echo "Basespace sample name for $filename contains dashes, input sample identifier $sample_identifier contains underscores, renaming identifier..."
84+
SAMPLENAME_RENAMED=$(echo "$sample_identifier" | sed 's|_|-|g' | sed 's|\.|-|g')
7885
fi
79-
if [[ $(echo "$elm" | cut -d '/' -f 3) =~ [_] && $sample_identifier =~ [-] ]]; then
80-
echo "Basespace sample name for $(echo "$elm" | cut -d '/' -f 3) contains underscores, input sample identifier $sample_identifier contains dashes, renaming identifier..."
81-
SAMPLENAME_RENAMED=$(echo $sample_identifier | sed 's|-|_|g')
86+
if [[ "$filename" =~ [_] && "$sample_identifier" =~ [-] ]]; then
87+
echo "Basespace sample name for $filename contains underscores, input sample identifier $sample_identifier contains dashes, renaming identifier..."
88+
SAMPLENAME_RENAMED=$(echo "$sample_identifier" | sed 's|-|_|g')
8289
fi
83-
if [[ ($(echo "$elm" | cut -d '/' -f 3) =~ [_] && $sample_identifier =~ [_]) || ($(echo "$elm" | cut -d '/' -f 3) =~ [-] && $sample_identifier =~ [-]) ]]; then
84-
echo "Both Basespace sample name and input sample identifier for $(echo "$elm" | cut -d '/' -f 3) contain matching separators..."
85-
SAMPLENAME_RENAMED=$sample_identifier
90+
if [[ ("$filename" =~ [_] && "$sample_identifier" =~ [_]) || ("$filename" =~ [-] && "$sample_identifier" =~ [-]) ]]; then
91+
echo "Both Basespace sample name and input sample identifier for $filename contain matching separators..."
92+
SAMPLENAME_RENAMED="$sample_identifier"
8693
fi
87-
if [[ ! ($(echo "$elm" | cut -d '/' -f 3) =~ [_]) || ! ($(echo "$elm" | cut -d '/' -f 3) =~ [-]) ]]; then
94+
if [[ ! ("$filename" =~ [_]) || ! ("$filename" =~ [-]) ]]; then
8895
echo "Filename doesn't use underscore or hyphen separators, using input sample identifier as-is"
89-
SAMPLENAME_RENAMED=$sample_identifier
96+
SAMPLENAME_RENAMED="$sample_identifier"
9097
fi
9198
done
9299

0 commit comments

Comments
 (0)