Code:
#!/bin/sh
# exon.sh gene_regions_file coding_exons_file
#
# This script reads the gene_regions_file and the coding_exons_file and
# produces output that classifies each line in the coding_exons_file as "exon",
# "intron", or "splicing".
#
# In addition, this script optionally prints debugging information showing:
# 1. the minimum and maximum values found for each line read from the
# gene_regions_file along with the field 1 and 4 values found on the
# corresponding line and the number of lines seen with those field 1 and
# 4 values,
# 2. the line number and contents of each line read from the
# coding_exons_file, and
# 3. while classifying an entry in the coding_exons_file; print the minimum
# and maximum values, the sequence number of the minimum and maximum
# values for the current entry's field 1 and 4 value; and the field 2
# value of the current entry.
#
# An earlier version of this script turned on debugging output if any arguments
# were given to the script when it was invoked (by using awk -v d=$#). This
# worked then because the filenames used by that version of the script were
# constants and built into the script rather than passed in as operands (as in
# this version of the script). Since this version of the script now requires
# two operands, the same results could be achieved by using
# awk -v d=$(($# > 2)). But, instead of doing that, this version of the
# script allows the user to set the debugging flag to a non-zero, non-null
# value before a file pathname operand if debugging printouts for that file
# are desired:
# No debugging output:
# exon.sh gene_regions_file coding_exons_file
# Enable debugging output for both input files:
# exon.sh d=1 gene_regions_file coding_exons_file
# Enable debugging output only for 2nd input file:
# exon.sh gene_regions_file d=1 coding_exons_file
# Enable debugging output only for 1st input file:
# exon.sh d=1 gene_regions_file d= coding_exons_file
# or:
# exon.sh d=1 gene_regions_file d=0 coding_exons_file
IAm=${0##*/} # Save final component of the pathname of this script for
# diagnostic messages.
if [ $# -lt 2 ]
then # Print a diagnostic usage message and exit if we don't have at least
# two arguments (the required input files). (As explained above,
# additional operands to enable or disable debugging may be included.
# Verification of there being two file operands and only appropiate
# debugging operands is left as an exercise for the reader. For now,
# assume that diagnostics from awk will be sufficient if inappropriate
# arguments are supplied.)
echo "Usage: $IAm gene-regions_file coding_exons_file" >&2
exit 1
fi
awk '
BEGIN { FS = "[\t_]" # Set input field separators to <tab> and <underscore>.
OFS = "\t" # Set output field separator to <tab>.
}
FNR == NR {
# When the current file record # is the same as the current record #
# from all files, we are looking at a record from the 1st input file...
# First, increment the number of entries seen for this pair of field 1
# & 4 values (c[$1, $4]) and then save the minimum value found in field
# 2 in this entry. Adding 0 converts an alphanumeric field to a
# numeric field by discarding any trailing non-numeric characters from
# the field.
m[$1, $4, ++c[$1, $4]] = $2 + 0
# And, then save the corresonding maximum value found in field 3 in
# this entry.
M[$1, $4, c[$1, $4]] = $3 + 0
# Note that this code will not work if the min-max ranges in the 1st
# input file are not in increasing order for each field 1 and 4 pair
# (although the entries for a given pair do not have to be adjacent).
# If debugging is enabled, print the minimum and maximum values saved
# from this entry.
if(d) printf("m[%s,%s,%d]=%s,M[%s,%s,%d]=%s\n",
$1, $4, c[$1, $4], m[$1, $4, c[$1, $4]],
$1, $4, c[$1, $4], M[$1, $4, c[$1, $4]])
# Skip the remaining steps in this script for this input record and
# continue with the next input record.
next
}
{ # If we are here, we have a record from the 2nd input file.
# If debugging is enabled, print the record number in this file and the
# contents of this record.
if(d) printf("FNR=%d:\"%s\"\n", FNR, $0)
# Loop through all of the entries for the field 1 & 4 pair values found
# in the current input record.
for(i = 1; i <= c[$1, $4]; i++) {
# If debugging is enabled, print the minimum and maximum values
# for this entry in the saved m[] and M[] arrays along with the
# field 2 value from the current input record.
if(d) printf("m[%d]=%d,M[%d]=%d,$2=%d\n",
i, m[$1, $4, i],
i, M[$1, $4, i],
$2)
# If the field 2 value in the current input record is in range
# for the entry being evaluated in this loop, set the 5th field
# in this input record to "exon" and break out of this loop.
if(m[$1, $4, i] <= $2 && $2 <= M[$1, $4, i]) {
$5 = "exon"
break
} else {
# If the minimum field 2 value in the current input
# record is greater than the numeric value of the field
# 2 value in the current input record...
if(m[$1, $4, i] > $2 + 0) {
# If the field 2 value in the current input
# record is within 10 of the low end of the
# range for the entry being evaluated in this
# iteration of the loop, set the 5th field in
# this input record to "splicing" and break
# out of this loop.
if(m[$1, $4, i] - 10 <= $2 + 0) {
$5 = "splicing"
break
} else {# Otherwise, set the 5th field in this
# input record to "intron" and break
# out of this loop.
$5 = "intron"
break
}
}
}
}
if(i > c[$1, $4])
# If we fell through the loop (instead of breaking out of it),
# we need to set the 5th field in this input record to
# "intron".
$5 = "intron"
}
1 # Print the updated current input record (with a field 5 value added).
' "$@" # Mark end of the awk script operand and add the operands provided to
# this script as operands to awk.