Code:
$ cat allinput.awk
BEGIN {
    # Records are separated by "<" and fields by ">", so every record has
    # the shape:  tagname attributes > text-up-to-the-next-tag
    RS = "<"
    FS = ">"

    # Output columns are tab-separated.
    OFS = "\t"

    # Output column order.  Every entry is a DATA key and must be allcaps.
    # INPUTA means tag "input", attribute "a"; a bare name such as A is the
    # contents of a tag of that name found inside <input>.
    split("INPUTA INPUTB A B C D E F G H I J K L M", ORDER, " ")
}
# These should be special variables for match() but aren't.
# rbefore: the part of STR that precedes the most recent match().
# Reads RSTART, so call it only after a successful match() on the same STR.
function rbefore(STR) {
    # awk strings are 1-indexed, so start at 1 explicitly.  A start of 0
    # (which is what an unset variable evaluates to) makes gawk and other
    # POSIX-style awks return one character fewer than requested, dropping
    # the last character before the match.
    return substr(STR, 1, RSTART - 1)
}
# rmid: the first character of the most recent match() within STR.
# Reads RSTART, so call it only after a successful match() on the same STR.
function rmid(STR) {
    return substr(STR, RSTART, 1)
}
# rall: the entire text matched by the most recent match() within STR.
# Reads RSTART and RLENGTH, so call it only after a successful match() on
# the same STR.
function rall(STR) {
    return substr(STR, RSTART, RLENGTH)
}
# rafter: the part of STR that follows the most recent match().
# Reads RSTART and RLENGTH, so call it only after a successful match() on
# the same STR.
function rafter(STR) {
    return substr(STR, RSTART + RLENGTH)
}
# aquote: store one attribute in A.
#   OUT   "name" SUBSEP "value", as assembled by qsplit(); empty means
#         there is nothing to store
#   A     destination array
#   PFIX  optional key prefix; when set the key is PFIX":"name
#   TA    local scratch array -- callers do not pass it
# The prefix and the name are upper-cased to form the key; the value is
# stored as-is.  Always returns "" so a caller can reset its accumulator
# with  OUT = aquote(OUT, A, PFIX).
function aquote(OUT, A, PFIX,    TA) {
    if (!OUT)
        return ""

    split(OUT, TA, SUBSEP)
    if (PFIX)
        A[toupper(PFIX ":") toupper(TA[1])] = TA[2]
    else
        A[toupper(PFIX) toupper(TA[1])] = TA[2]

    return ""
}
# Intended to be less stupid about quoted text in XML/HTML.
# Splits a='b' c='d' e='f' into A[PFIX":"a]=b, A[PFIX":"c]=d, etc.
#
#   STR   attribute text (whatever follows the tag name)
#   A     array that receives the attributes (keys are upper-cased by aquote)
#   PFIX  optional key prefix, handed through to aquote()
#   X, OUT, Q, RMID   locals -- callers do not pass them.  X is unused and
#         kept only so the existing parameter positions do not shift.
#
# Q and RMID are locals on purpose: the quote state must start empty on
# every call, otherwise one tag with an unterminated quote would leave Q set
# and corrupt the parsing of every later tag.
#
# Uses rbefore(), rmid(), rall() and rafter(), which read the RSTART/RLENGTH
# left behind by match().
function qsplit(STR, A, PFIX, X, OUT, Q, RMID) {
    # Stop at each run of whitespace, each quote character and each "=".
    # "\r" is part of the whitespace class so an unquoted CR (CRLF input)
    # separates attributes instead of sticking to a value.
    while (STR && match(STR, /([ \r\n\t]+)|[\x27\x22=]/)) {
        OUT = OUT rbefore(STR)          # ordinary text ahead of the delimiter
        RMID = rmid(STR)

        if ((RMID == "'") || (RMID == "\"")) {          # Quote characters
            if (!Q)
                Q = RMID                # Begin quote section
            else if (Q == RMID)
                Q = ""                  # End quote section
            else
                OUT = OUT RMID          # Quoted quote
        } else if (RMID == "=") {
            # A quoted "=" is literal; an unquoted one splits name from value.
            if (Q)
                OUT = OUT RMID
            else
                OUT = OUT SUBSEP
        } else if ((RMID == "\r") || (RMID == "\n") || (RMID == "\t") || (RMID == " ")) {
            if (Q)
                OUT = OUT rall(STR)             # Literal quoted whitespace
            else
                OUT = aquote(OUT, A, PFIX)      # Unquoted WS ends this attribute
        }

        STR = rafter(STR)       # Strip off the text we've processed already.
    }

    aquote(OUT STR, A, PFIX)    # Process any text we haven't already.
}
# Start every record as an ordinary tag with no name; later rules fill
# these in.
{
    SPEC = 0
    TAG = ""
}

# The first "line" is blank when RS=< (it is whatever precedes the first
# "<").  Echo it only when ORS equals RS, then skip every other rule.
NR == 1 {
    if (ORS == RS) print
    next
}

# XML specification junk: records that begin with "!" or "?".
/^[!?]/ {
    SPEC = 1
}
# Handle open-tags
# Fires when $1 starts with a tag name, i.e. not with "/", whitespace or
# ">".  The match() in the pattern leaves RSTART/RLENGTH describing that
# name, which the action and rafter() rely on.
match($1, /^[^\/ \r\n\t>]+/) {
    TAG = substr(toupper($1), RSTART, RLENGTH)

    # Push the name onto the tag stack, unless this is specification junk
    # or $1 ends in "/" (a self-closing tag).
    if ((!SPEC) && ($1 !~ /\/$/)) {
        TAGS = TAG "%" TAGS
        LTAGS = TAGS
        DEP++
    }

    # Throw away the previous tag's attributes, then parse this tag's.
    split("", ARGS)
    qsplit(rafter($1), ARGS)
}
# Handle close-tags
# Fires for records that start with "/" and are not specification junk.
(!SPEC) && /^[\/]/ {
    LTAGS = TAGS                # the stack as it was before this pop

    # Strip the leading "/" so $1 is the bare name, pop that name if it is
    # the innermost stack entry, then put the "/" back for later rules.
    # (A "^.*NAME%" pattern would instead discard every entry down to NAME.)
    $1 = substr($1, 2)
    sub("^" toupper($1) "%", "", TAGS)
    $1 = "/" $1

    # Depth is the number of "%"-terminated names left on the stack; split()
    # counts one extra, empty piece after the final "%".
    DEP = split(TAGS, TA, "%")
    if (DEP > 0) DEP--
}
### Example of how to use it ###
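# TAG is the upper-cased name of the tag opened by the current record
# (it is "" on close-tag records)
# TAGS is a string of the open tag names, innermost first, each followed
# by "%", like INNER%MIDDLE%OUTERMOST%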
# $2 is CDATA inside the current tag
# ARGS is an array of arguments for the current tag
#
# So, when processing <a> in <html><a href="index.html">Yay!</a></html>
# it would have:
# TAG="A"
# ARGS["HREF"]="index.html"
# TAGS="A%HTML%"
# $2="Yay!"
# Handle <input> tag
# While INPUT is the innermost open tag, copy the current ARGS into DATA
# under keys prefixed with TAG, so attribute "a" of <input> becomes
# DATA["INPUTA"].
# NOTE(review): this also fires on the close-tag record of a direct child of
# <input>.  TAG is "" there and ARGS still holds the attributes of the most
# recently opened tag, so those land in DATA under the bare attribute name.
# Confirm that is intended.
TAGS ~ /^INPUT%/ {
    for (X in ARGS) {
        DATA[TAG X] = ARGS[X]
    }
}
# Parse <tags> inside <input> so DATA[TAGNAME]=CONTENTS
# Fires for any non-close record somewhere inside an <input> element whose
# text ($2, the part between this tag's ">" and the next "<") contains at
# least one non-whitespace character.
(TAGS ~ /(^|%)INPUT%/) && ($2 ~ /[^ \r\n\t]/) && !/^\// {
    # Clean up tag contents.  Tabs are trimmed as well as spaces, CRs and
    # newlines: the pattern above already counts a tab as whitespace, and a
    # stray leading or trailing tab would otherwise stay in the value (and
    # block the trailing-newline trim), corrupting the tab-separated output.
    sub(/^[ \r\n\t]+/, "", $2)
    sub(/[ \r\n\t]+$/, "", $2)
    DATA[TAG] = $2
}
# Handle </input>, printing and clearing collected data
# Emits one line holding the DATA values in ORDER, joined by OFS.
toupper($1) == "/INPUT" {
    PFIX = ""
    M = 1
    while (M in ORDER) {
        # Convert blank fields into single spaces, since the shell will see
        # two tabs in a row as one field, skipping the blank one.
        if (DATA[ORDER[M]] == "") {
            DATA[ORDER[M]] = " "
        }
        printf("%s%s", PFIX, DATA[ORDER[M]])
        PFIX = OFS              # every column after the first gets a separator
        M++
    }
    printf("\n")

    # Forget everything collected for this <input>.
    split("", DATA)
}
$ awk -f allinput.awk allinput.xml
2389906 install 111 222 333 444 C,D,E,G C,D,E,G 555
4732435 delete 999 792 990 942 992 C,D,G,H,I C,D,G,H,I 804
$ awk -f allinput.awk allinput.xml |
while IFS=$'\t' read INPUTA INPUTB A B C D E F G H I J K L M
do
# Convert all single-space fields into completely blank fields
for X in INPUTA INPUTB A B C D E F G H I J K L M
do
[ "${!X}" = " " ] && read $X # Cheeky trick to set arbitrary variable contents
done < /dev/null
echo "doing something with $INPUTA $INPUTB $L $M"
done
doing something with 2389906 install C,D,E,G 555
doing something with 4732435 delete C,D,G,H,I 804
$