With the same approach as bu5hman, i.e. assuming that the sample ID is the part of the filename up to the first dot:
#!/bin/sh
csv_print_row () {
    # Write all arguments to standard output as a single CSV record.
    # A field is wrapped in double quotes only if it has a comma in it;
    # no other quoting or escaping is attempted.
    #
    # The positional parameters are used as a queue: every pass drops the
    # field at the front and appends its output form at the back, so once
    # the loop ends "$@" holds exactly the finished fields, in order.
    for field
    do
        shift
        if [ "${field#*,}" != "$field" ]; then
            # Stripping "everything up to the first comma" changed the
            # value, so the field contains a comma: quote it.
            set -- "$@" "\"$field\""
        else
            set -- "$@" "$field"
        fi
    done

    # "$*" joins the parameters with the first character of IFS. The
    # subshell confines the IFS change to this one printf call.
    (
        IFS=,
        printf "%s\n" "$*"
    )
}
# Output the CSV header row.
csv_print_row "sample_id" "absolute-filepath" "direction"

# Emit one row per *.fq file in the current directory.
for fastq in *.fq; do
    # If nothing matches, the shell leaves the pattern as the literal
    # string "*.fq". Skip that so a directory without samples produces
    # only the header instead of a bogus row. The -L test keeps dangling
    # symbolic links, which the pattern does match but -e reports as missing.
    [ -e "$fastq" ] || [ -L "$fastq" ] || continue

    # The sample ID is the filename up to the first dot.
    sample_id=${fastq%%.*}

    # Derive the read direction from the ".R1." / ".R2." filename component.
    case $fastq in
        *.R1.*) dir=forward ;;
        *.R2.*) dir=reverse ;;
        *)      dir=unknown ;;
    esac

    # Output the row for this sample; the path is made absolute via $PWD.
    csv_print_row "$sample_id" "$PWD/$fastq" "$dir"
done
Testing:
$ ls -l
total 4
-rw-r--r-- 1 kk wheel 0 Mar 13 18:01 sample-1.R1.fq
-rw-r--r-- 1 kk wheel 0 Mar 13 18:01 sample-1.R2.fq
-rw-r--r-- 1 kk wheel 0 Mar 13 18:01 sample-2.R1.fq
-rw-r--r-- 1 kk wheel 0 Mar 13 18:01 sample-2.R2.fq
-rw-r--r-- 1 kk wheel 0 Mar 13 18:01 sample-3.R1.fq
-rw-r--r-- 1 kk wheel 0 Mar 13 18:01 sample-3.R2.fq
-rw-r--r-- 1 kk wheel 0 Mar 13 18:01 sample-4.R1.fq
-rw-r--r-- 1 kk wheel 0 Mar 13 18:01 sample-4.R2.fq
-rw-r--r-- 1 kk wheel 629 Mar 13 18:00 script.sh
-rw-r--r-- 1 kk wheel 0 Mar 13 18:02 strange, sample.R1.fq
-rw-r--r-- 1 kk wheel 0 Mar 13 18:02 strange, sample.R2.fq
-rw-r--r-- 1 kk wheel 0 Mar 13 18:02 strange, sample.R3.fq
$ sh script.sh
sample_id,absolute-filepath,direction
sample-1,/tmp/shell-yash.zm5cvzG6/sample-1.R1.fq,forward
sample-1,/tmp/shell-yash.zm5cvzG6/sample-1.R2.fq,reverse
sample-2,/tmp/shell-yash.zm5cvzG6/sample-2.R1.fq,forward
sample-2,/tmp/shell-yash.zm5cvzG6/sample-2.R2.fq,reverse
sample-3,/tmp/shell-yash.zm5cvzG6/sample-3.R1.fq,forward
sample-3,/tmp/shell-yash.zm5cvzG6/sample-3.R2.fq,reverse
sample-4,/tmp/shell-yash.zm5cvzG6/sample-4.R1.fq,forward
sample-4,/tmp/shell-yash.zm5cvzG6/sample-4.R2.fq,reverse
"strange, sample","/tmp/shell-yash.zm5cvzG6/strange, sample.R1.fq",forward
"strange, sample","/tmp/shell-yash.zm5cvzG6/strange, sample.R2.fq",reverse
"strange, sample","/tmp/shell-yash.zm5cvzG6/strange, sample.R3.fq",unknown
To create your manifest:
sh script.sh >manifest-file.csv
Note that this would generate invalid CSV output if any filename contains double quotes.
To properly handle fields that contain double quotes (such a field must itself be quoted, and each double quote inside it doubled), you would have to use something like
csv_print_row () {
    # Write all arguments to standard output as a single CSV record.
    # Fields containing a comma or a double quote are wrapped in double
    # quotes, and double quotes inside such a field are doubled.
    #
    # The positional parameters are used as a queue: every pass drops the
    # field at the front and appends its output form at the back.
    for field
    do
        shift
        case $field in
            *\"*)
                # Wrap first, then double every quote (the wrapping pair
                # included), then peel one quote off each end again.
                # Wrapping before the command substitution matters: the
                # closing quote keeps any trailing newlines in the field
                # from being stripped by $( ... ).
                field=$( printf '%s\n' "\"$field\"" | sed 's/"/""/g' )
                field=${field#\"}
                field=${field%\"}
                ;;
            *,*)
                # A comma but no double quote: plain wrapping is enough.
                field=\"$field\"
                ;;
        esac
        set -- "$@" "$field"
    done

    # "$*" joins the parameters with the first character of IFS. The
    # subshell confines the IFS change to this one printf call.
    (
        IFS=,
        printf "%s\n" "$*"
    )
}
This still does not handle fields that contain newlines correctly (a field whose only special character is a newline is left unquoted), but handling that would take us outside the scope of this question.
See also: