Code:
$ cat xmls.awk
BEGIN {
    # Records are split on "<", so each record starts just after a tag
    # opener; fields are split on whitespace, ">" and "/".
    RS = "<"
    FS = "[ \n\t\t>/]"

    # Index of the top of the tag stack TS[]; 0 means the stack is empty.
    POS = 0

    # How many close tags in a row before data dump. Only referenced by
    # the disabled POP logic in the close-tag rule.
    DEP = 2
}
# Because RS is "<" rather than newline, input that starts with a tag
# produces an empty first record (the text before the first "<"). Drop it.
NR == 1 && $0 == "" { next }
# XML comments. A comment body may itself contain "<", in which case it is
# spread over several records.
/^!--/ {
    # Keep reading records until one contains the comment terminator.
    # If input runs out (or getline fails) first, stop the whole program.
    for (;;) {
        I = index($0, "-->")
        if (I)
            break
        if ((getline) <= 0)
            exit
    }
    # Swap the comment for a self-closing placeholder tag, keeping whatever
    # followed "-->" in this record so later rules still see that text.
    $0 = "--XMLCOMMENT-- />" substr($0, I + 3)
}
# Ignore XML specification junk: any record that starts with "?" (processing
# instructions such as <?xml ...?>) or "!" (declarations such as <!DOCTYPE>).
# A bracket expression is used instead of "\?" / "\!" because "\!" is not a
# defined regex escape and some awks warn about it.
/^[?!]/ { next }
# Close tags: unwind the tag stack TS[] (and the path string TSS) down to and
# including the matching open tag.
#
# FS splits on "/", so for a record like "/p>" $1 is empty and $2 holds the
# tag name.
/^\// {
    # Walk down from the top of the stack looking for the nearest open tag
    # with this name. The comparison must use the loop index TPOS; comparing
    # against TS[POS] would only ever test the top entry.
    for (TPOS = POS; (TPOS > 0) && (toupper($2) != TS[TPOS]); TPOS--)
        ;

    if (TPOS <= 0)
        print "Went under for "$2
    else {
        # Pop the matching entry and everything opened above it, removing
        # one trailing "/NAME" component from TSS per popped tag.
        TPOS--
        while (TPOS < POS) {
            sub(/\/[^\/]*$/, "", TSS)
            POS--
        }
        # printf("%s-%s\n", TSS, toupper($2));
    }

    # Disabled: clear the attribute table after DEP close tags in a row.
    # if (++POP == DEP) for (X in A) delete A[X];

    next
}
# These should be special variables for match() but aren't.
# String before match: the part of STR that precedes the most recent
# match() hit, using the RSTART that match() left behind.
# The start index must be 1: substr() with a start below 1 is handled
# differently across awk implementations and can drop the last character.
# If the match failed (RSTART == 0) the length is negative and "" is returned.
function rbefore(STR) { return(substr(STR, 1, RSTART-1)); }
# First char of match: the single character of STR at RSTART, as set by the
# most recent match() call.
function rmid(STR)
{
    return substr(STR, RSTART, 1)
}
# Entire match: the RLENGTH characters of STR starting at RSTART, as set by
# the most recent match() call.
function rall(STR)
{
    return substr(STR, RSTART, RLENGTH)
}
# String after match: everything in STR that follows the most recent
# match() hit, i.e. from position RSTART+RLENGTH to the end.
function rafter(STR)
{
    return substr(STR, RSTART + RLENGTH)
}
# Turns Q=R into A[Q]=R.
#   OUT - "name" SUBSEP "value" pair; nothing is stored when it is empty
#   A   - array that receives the pair, keyed by the lower-cased name
#   TA  - local scratch array (callers do not pass it)
# Always returns "", so callers can reset their accumulator with
# OUT = aquote(OUT, A).
function aquote(OUT, A,    TA)
{
    if (!OUT)
        return ""

    split(OUT, TA, SUBSEP)
    A[tolower(TA[1])] = TA[2]
    return ""
}
# Intended to be less stupid about quoted text in XML/HTML.
# Splits a='b' c='d' e='f' into A[a]=b, A[c]=d, A[e]=f, etc.
# Names are stored lower-cased (see aquote).
#   STR  - attribute text of one tag
#   A    - array that receives the name/value pairs
#   X, OUT, Q, RMID - locals; callers pass only STR and A
#       OUT  - the pair being accumulated, name and value joined by SUBSEP
#       Q    - the quote character currently open, or "" outside quotes
#       RMID - first character of the current match
# Q and RMID are locals so that an unterminated quote in one tag cannot
# leak into the parsing of the next tag.
function qsplit(STR, A, X, OUT, Q, RMID) {
    # Each match is a run of whitespace, a quote character, or "=".
    # "\r" is included so CR is treated like the other whitespace
    # characters handled below.
    while (STR && match(STR, /([ \r\n\t]+)|[\x27\x22=]/)) {
        OUT = OUT rbefore(STR)
        RMID = rmid(STR)

        if ((RMID == "'") || (RMID == "\"")) {
            if (!Q)
                Q = RMID            # opening quote
            else if (Q == RMID)
                Q = ""              # matching closing quote
            else
                OUT = OUT RMID      # other quote kind inside quotes: literal
        } else if (RMID == "=") {
            # Unquoted "=" separates name from value; quoted "=" is data.
            if (Q)
                OUT = OUT RMID
            else
                OUT = OUT SUBSEP
        } else if ((RMID == "\r") || (RMID == "\n") || (RMID == "\t") || (RMID == " ")) {
            # Quoted whitespace is data; unquoted whitespace ends the pair,
            # which aquote() stores before resetting OUT to "".
            if (Q)
                OUT = OUT rall(STR)
            else
                OUT = aquote(OUT, A)
        }

        STR = rafter(STR)
    }

    # Flush whatever is left after the last delimiter.
    aquote(OUT STR, A)
}
# Non-close tag: record the tag name, its attributes, and the text that
# follows it (up to the next "<").
#
# Globals written: POP, TAG, TDATA, CDATA, TS[]/POS/TSS (tag stack and path),
# TA[] (attributes of this tag), and A[] (attributes and "CDATA:<TAG>" text
# accumulated over all tags seen so far; it is not cleared here).
!/^\// {
    POP = 0
    TAG = $1

    # Remove only the tag name from the front of the record. The name ends
    # at whitespace, ">" or "/", the same delimiters FS uses for $1.
    # Stopping at whitespace alone would also eat ">text" from "<p>text",
    # or "/>" from "<br/>".
    sub(/^[^ \r\n\t>\/]*/, "")

    # Split the rest at the end of the tag: attributes before, text after.
    # If there is no ">" at all, match() fails; the helpers then yield an
    # empty TDATA and the whole remainder as CDATA.
    match($0, /\/?>/)
    TDATA = rbefore($0)
    CDATA = rafter($0)

    # Flatten and strip whitespace
    gsub(/[ \r\n\t]+/, " ", CDATA)
    gsub(/^[ \r\n\t]+/, "", CDATA)
    gsub(/[ \r\n\t]+$/, "", CDATA)

    if (RLENGTH != 2) {     # Found > instead of self-closing />
        TS[++POS] = toupper(TAG)
        # printf("%s+%s\n", TSS, toupper(TAG));
        TSS = TSS"/"toupper(TAG)
    }

    # Parse this tag's attributes into a clean TA[], then merge them into A[].
    for (X in TA) delete TA[X]
    qsplit(TDATA, TA)
    for (X in TA) A[X] = TA[X]

    if (length(CDATA)) A["CDATA:"toupper(TAG)] = CDATA
    # for(X in A) printf("%s[%s]=%s\n", TAG, X, A[X]);
}
# Extraction rule: when the most recently recorded "id" attribute is
# popper_LaboratoryTestName and the tag on top of the stack is P, print the
# text last stored for a P tag. Close-tag records never reach this rule
# because their rule ends with "next".
{
    if (A["id"] == "popper_LaboratoryTestName") {
        if (TS[POS] == "P")
            print A["CDATA:P"]
    }
}
$ wget -q http://www.ncbi.nlm.nih.gov/gtr/tests/508680/ -O - | awk -f xmls.awk
Exome Sequencing (Exome)
$