Code:
$ cat xmlt.awk
# Global setup: record/field splitting and the fixed leading output columns.
BEGIN {
    DEP = 4     # How many tags out to keep data
    POS = 0     # Position in tag stack (TS)
    RS = "<"    # Input record separator: every record begins right after a "<"
    # Input field separator: any single whitespace character, ">" or "/".
    # Carriage return is included so that CRLF input does not leave a
    # trailing "\r" glued to the tag name in $1; this matches the
    # whitespace classes used by the tag rules further down.
    FS = "[ \r\n\t>/]"
    # ORDER is indexed both ways: column number -> property name and
    # property name -> column number.  O is the number of columns so far.
    # Hardcode the first two things in the output order.
    ORDER[++O] = "XN:MECONTEXT:ID"
    ORDER["XN:MECONTEXT:ID"] = O
    ORDER[++O] = "XN:VSDATACONTAINER:ID"
    ORDER["XN:VSDATACONTAINER:ID"] = O
}
# Decides whether a newly seen property should be added to the list of
# columns to print.
#   PROPNAME  property key to test
#   PROPVAL   the property's value (accepted from callers, not consulted)
# Returns 1 when the open-tag path TSS contains XN:VSDATACONTAINER and
# PROPNAME contains "DATA"; returns 0 otherwise.
# NOTE(review): the previous header said only CDATA elements should be
# caught, but the effective test has always been /DATA/ (the stricter
# /CDATA/ check that followed could never return true).  /DATA/ is kept
# so existing output does not change -- confirm which one is intended.
function catchthis(PROPNAME, PROPVAL) {
    return (TSS ~ /XN:VSDATACONTAINER/) && (PROPNAME ~ /DATA/)
}
# With RS set to something other than newline, the text before the very
# first separator arrives as record 1; drop it when it is empty.
NR == 1 && !length($0) {
    next
}
# XML comments: the record starts with "!--".
/^!--/ {
    # Keep reading records until one contains the comment terminator;
    # stop the whole program if input runs out first.
    for (;;) {
        I = index($0, "-->")
        if (I) break
        if ((getline) <= 0) exit
    }
    # Replace the comment with a placeholder self-closing tag and keep
    # whatever text followed the terminator.
    $0 = "--XMLCOMMENT-- />" substr($0, I + 3)
}
# Records that begin with "?" or "!" (XML declaration, processing
# instructions, DOCTYPE and the like) carry no data for us.
/^[?!]/ {
    next
}
# These should be special variables for match() but aren't.
# String before match: returns the part of STR that precedes the most
# recent match(), i.e. characters 1 .. RSTART-1.  Returns "" when the
# match starts at position 1 or when match() failed (RSTART == 0).
function rbefore(STR) {
    # The start position must be the literal 1.  It used to be the global
    # N, which is unset at first and is later left at an arbitrary value
    # by the column loop in the close-tag rule, so the returned prefix
    # was truncated or shifted.
    return substr(STR, 1, RSTART - 1)
}
# Yields the single character of STR located at RSTART, i.e. the first
# character of the most recent match().
function rmid(STR)
{
    return substr(STR, RSTART, 1)
}
# Yields the full text of the most recent match() within STR: RLENGTH
# characters starting at RSTART.
function rall(STR)
{
    return substr(STR, RSTART, RLENGTH)
}
# Yields everything in STR that follows the most recent match(), i.e.
# from position RSTART+RLENGTH to the end.
function rafter(STR)
{
    return substr(STR, RSTART + RLENGTH)
}
# Stores one "name SUBSEP value" pair into array A.
#   OUT   the pair, name and value joined by SUBSEP; nothing is stored
#         when it is empty
#   A     destination array
#   PFIX  optional prefix; when set, the key becomes PFIX ":" name
#   TA    local scratch array
# The key is upper-cased, the value is stored unchanged.  Always returns
# "" so callers can assign the result to reset their accumulator.
function aquote(OUT, A, PFIX, TA) {
    if (!OUT)
        return ""
    split(OUT, TA, SUBSEP)
    A[toupper(PFIX (PFIX ? ":" : "") TA[1])] = TA[2]
    return ""
}
# Intended to be less stupid about quoted text in XML/HTML.
# Splits a='b' c='d' e='f' into A[PFIX":"a]=b, A[PFIX":"c]=d, etc.
#   STR   attribute text of one tag
#   A     array that receives the pairs (filled through aquote())
#   PFIX  key prefix handed on to aquote()
#   X     unused, kept so the parameter list stays compatible
#   OUT   local: the "name SUBSEP value" pair being assembled
#   Q     local: the quote character currently open, or "" outside quotes
#   RMID  local: first character of the current match
# Q and RMID are locals so that an unterminated quote in one tag cannot
# change how the next tag is parsed.
function qsplit(STR, A, PFIX, X, OUT,    Q, RMID) {
    # Each match is a run of whitespace, a quote character or "=".
    # "\r" is part of the whitespace class so CRLF input is handled like
    # any other whitespace instead of leaking CRs into names and values.
    while (STR && match(STR, /([ \r\n\t]+)|[\x27\x22=]/)) {
        OUT = OUT rbefore(STR)          # plain text before the delimiter
        RMID = rmid(STR)
        if ((RMID == "'") || (RMID == "\"")) {      # Quote characters
            if (!Q) Q = RMID                        # Begin quote section
            else if (Q == RMID) Q = ""              # End quote section
            else OUT = OUT RMID                     # Quoted quote
        } else if (RMID == "=") {
            if (Q) OUT = OUT RMID                   # Literal "=" inside quotes
            else OUT = OUT SUBSEP                   # name/value boundary
        } else {
            # Only whitespace is left as a possible match here.
            if (Q) OUT = OUT rall(STR)              # Literal quoted whitespace
            else OUT = aquote(OUT, A, PFIX)         # Unquoted WS, next block
        }
        STR = rafter(STR)   # Strip off the text we've processed already.
    }
    aquote(OUT STR, A, PFIX)    # Process any text we haven't already.
}
# Records every property in AIN.  Call before POS is incremented.
#   AIN   array of property name -> value for the tag being opened
#   X, S  locals (S is unused)
# A property not yet present in ORDER that catchthis() accepts is
# appended as a new output column.  Every property, accepted or not, has
# its value stored in PROP and its KEEP level set to (POS+2)-DEP.
function addprop(AIN, X, S) {
    for (X in AIN) {
        if (!(X in ORDER) && catchthis(X, AIN[X])) {
            O++
            ORDER[O] = X
            ORDER[X] = O
        }
        KEEP[X] = (POS + 2) - DEP
        PROP[X] = AIN[X]
    }
}
# Forgets properties whose KEEP level is above the current stack
# position.  Call before POS is decremented.
#   TA, N, M, X  locals (only X is used)
# Each property with KEEP[name] > POS is removed from both PROP and KEEP.
function delprop(TA, N, M, X) {
    for (X in KEEP) {
        if (KEEP[X] <= POS)
            continue
        delete PROP[X]
        delete KEEP[X]
    }
}
# Non-close tag: the record is "name attributes>text" or
# "name attributes/>text".
!/^\// {
    TAG = $1
    sub(/^[^ \r\n\t>\/]*/, "")      # remove the tag name from the record
    match($0, /\/?>/)               # end of the tag, ">" or "/>"
    # Remember the length of the terminator right away: qsplit() below
    # calls match() again and overwrites RLENGTH, so testing RLENGTH
    # after it would look at the wrong match.
    ENDLEN = RLENGTH
    TDATA = rbefore($0)             # attribute text
    CDATA = rafter($0)              # character data following the tag
    # Flatten and strip whitespace
    gsub(/[ \r\n\t]+/, " ", CDATA)
    gsub(/^[ \r\n\t]+/, "", CDATA); gsub(/[ \r\n\t]+$/, "", CDATA)
    for (X in TA) delete TA[X]      # start from an empty property set
    qsplit(TDATA, TA, TAG)
    if (length(CDATA))
        TA[toupper(TAG) ":CDATA"] = CDATA
    addprop(TA)                     # must run before POS is incremented
    if (ENDLEN != 2) {              # Found > instead of self-closing />
        TS[++POS] = toupper(TAG)
        TSS = TSS "/" toupper(TAG)
    }
    # for(X in TA) printf("%s[%s]=%s\n", TAG, X, TA[X]);
}
# Close tags: the record starts with "/", so $1 is empty and $2 is the
# tag name.
/^\// {
    CLOSING = toupper($2)
    # Walk down the tag stack to the nearest entry with this name;
    # TPOS ends at 0 when there is none.
    for (TPOS = POS; (TPOS > 0) && (CLOSING != TS[TPOS]); TPOS--);
    if (CLOSING == "XN:VSDATACONTAINER") {
        # Emit one line holding every known column, in ORDER.
        OUT = ""
        PFIX = ""
        # COL is a dedicated counter: the global N is read by rbefore()
        # and must not be changed here.
        for (COL = 1; COL <= O; COL++) {
            # A column with no value is shown as "!" plus its name.
            # Compare with "" rather than testing truth, so that a
            # legitimate value of 0 is not mistaken for a missing one.
            if (PROP[ORDER[COL]] == "") PROP[ORDER[COL]] = "!" ORDER[COL]
            OUT = OUT PFIX PROP[ORDER[COL]]
            PFIX = OFS
        }
        print OUT
    }
    if (TPOS <= 0) {
        print "Went under for " $2 " pos=" POS
    } else {
        # Pop the matched tag and everything above it, dropping expired
        # properties and the last path component of TSS at each step.
        TPOS--
        while (TPOS < POS) {
            delprop()               # must run before POS is decremented
            sub(/\/[^\/]*$/, "", TSS)
            POS--
        }
    }
}
$ awk -f xmlt.awk OFS="\t" Enodeb_MO_Export_10_47.xml
CCL01736 1 vsDataENodeBFunction EricssonSpecificAttributes.13.25CCL01736 SubNetwork=ONRM_ROOT_MO_R,MeContext=CCL01736,ManagedElement=1,vsDataTransportNetwork=1,vsDataSctp=1 0 32 1 0 310 410 3 30 1440 30 true SubNetwork=ONRM_ROOT_MO_R,MeContext=CCL01736,ManagedElement=1,vsDataIpSystem=1,vsDataIpAccessHostEt=1 false 1true true 1 -2000000000 -2000000000 -2000000000 -2000000000 100 true 0 false