Code:
#!/bin/bash
#
# Per-word frequency report, broken down by the label in column 1.
#
# Input is a comma-separated file whose first line is a header. For every
# other line, field 1 is used as a label (the report has columns for the
# labels 1, 0 and -1), field 2 is skipped, and everything after the second
# comma is treated as free text.
#
# Usage: this_script [FILE]      (FILE defaults to "infile")
#
# Output (stdout): a CSV header followed by one row per word of at least four
# characters, sorted by total count (descending) and then by word.

#######################################
# Print the word-frequency report for one CSV file.
# Arguments: $1 - path of the CSV file to read
# Outputs:   CSV report on stdout; awk/sort diagnostics on stderr
# Returns:   exit status of awk
#######################################
word_freq_report() {
  # NOTE: the awk program is wrapped in single quotes, so it must not contain
  # a literal apostrophe; \047 is used for that character instead.
  awk -F, '
    NR > 1 {
      # Free text = the record minus its first two fields and their commas.
      text = tolower(substr($0, length($1 "," $2 ",") + 1))

      # Split on every character that is not a letter or an apostrophe.
      # Runs of separators yield empty pieces; the length test drops them.
      n = split(text, words, "[^A-Za-z\047]")
      for (i = 1; i <= n; i++) {
        w = words[i]
        if (length(w) >= 4) {
          counts[w]++       # occurrences across all labels
          freq[$1, w]++     # occurrences under the label of this row
        }
      }
    }

    END {
      header = "WORD,TOTFrequency,(1)Frequency,(0)Frequency,"
      header = header "(-1)Frequency,(1%)Frequency,(0%)Frequency,(-1%)Frequency,NV"
      print header
      # The header is written by awk itself while the rows are written by the
      # sort child process; flush now so the header cannot end up after them.
      fflush()

      sorter = "sort -t, -k2,2nr -k1,1"
      for (w in counts) {
        total = counts[w]   # always >= 1 for a key of counts
        pos = freq[1, w] + 0
        neu = freq[0, w] + 0
        neg = freq[-1, w] + 0

        # NV: (pos - neg) / (pos + neg), or 0 when the word never appears
        # under label 1 or -1. NOTE(review): presumably a normalized polarity
        # score in [-1, 1]; the previous expression read freq[0,1] and lacked
        # parentheses, so it always produced the raw label-1 count. Confirm
        # the intended formula.
        polar = pos + neg
        nv = polar ? (pos - neg) / polar : 0

        printf("%s,%d,%d,%d,%d,%0.1f%%,%0.1f%%,%0.1f%%,%0.2f\n",
               w, total, pos, neu, neg,
               pos * 100 / total, neu * 100 / total, neg * 100 / total,
               nv) | sorter
      }
      # Closing the pipe makes awk wait until sort has written all rows.
      close(sorter)
    }
  ' "$1"
}

word_freq_report "${1:-infile}"