[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: LYNX-DEV lynx and tables - t2thtml awk script
From: |
Tom Zerucha |
Subject: |
Re: LYNX-DEV lynx and tables - t2thtml awk script |
Date: |
Tue, 21 Jan 1997 17:29:19 -0500 (EST) |
The following awk script will make a passable table output in lynx,
at least on the Linux Console. Check the CONFIGURE stuff if you don't
want graphics characters or might want other options.
You need to pass the HTML through the script, e.g.
lynx -source wherever.com/whatever.html | t2thtml.awk >/tmp/lt.html ;
lynx /tmp/lt.html
(I may get a http proxy that will allow filters up when I get more
ambitious).
Forms almost look right, and may even work sometimes.
Bad things happen with
<form>...<TD>...</TD></FORM><TD>
since lynx likes to break lines at </form>.
Worse things happen with
</TR></FORM><FORM...><TR>
since I need to add a prefix tag store.
The t2tgrnest script is also available - an earlier one at my website, but
an updated one and t2thtml will be there when I get a chance to post them.
address@hidden
finger address@hidden for PGP key
------------------------------------------------------
#!/usr/bin/gawk -f
# 1/21/97 first pass
# problems for Rowspan and many forms in tables
# they often look right, but won't all work
#look for CONFIGURE for changeable parameters
####################
#This is to expand for integration into something like lynx
#The return value is the amount of spaces the markup takes
# vislength(istr) -- number of character cells ISTR occupies once markup
# has been removed.
#   istr     text that may contain tags, a few entities and <SELECT> lists
#   returns  the width as a number (0 for an empty string)
# The names after the extra blanks in the parameter list are locals.
# Bytes \200-\240 are dropped before measuring -- presumably they take no
# room on the author's console; TODO confirm.
function vislength(istr,    str, maxl, selrgn, point1, point2, smaxl) {
    str = istr
    gsub("[\200-\240]", "", str)
    # "\046" is "&".  Every entity handled here shows as a single character.
    gsub("\046#169;", "C", str)
    gsub("\046reg;", "R", str)
    gsub("\046copy;", "C", str)
    gsub("\046#162;", "c", str)
    gsub("\046#160;", " ", str)
    gsub("\046nbsp;", " ", str)
    gsub("\046nbsp", " ", str)
    # "&amp;" goes last so that "&amp;nbsp;" is not decoded a second time
    gsub("\046amp;", "\\&", str)
    str = toupper(str)
    maxl = 0

    # A <SELECT> ... </SELECT> list counts as its longest option plus one,
    # and is then cut out of the string.
    while (match(str, "<SELECT")) {
        point1 = RSTART
        selrgn = substr(str, point1)
        match(selrgn, "</SELECT")
        point2 = RSTART                     # 0 when the list is never closed
        selrgn = substr(selrgn, 1, point2 + 9)
        while (sub("> ", ">", selrgn))
            ;
        while (sub(" <", "<", selrgn))
            ;
        smaxl = 1
        while (match(selrgn, "<OPTION")) {
            selrgn = substr(selrgn, RSTART + 7)
            match(selrgn, ">")
            selrgn = substr(selrgn, RSTART + 1)
            if (match(selrgn, "<")) {
                if (RSTART > smaxl)
                    smaxl = RSTART
            }
            else if (length(selrgn) > smaxl)
                smaxl = length(selrgn)
        }
        # awk strings start at 1; a start of 0 is not portable
        str = substr(str, 1, point1 - 1) substr(str, point1 + point2 + 8)
        maxl += smaxl + 1
    }

    # Count the text between tags; whatever lies inside <...> is free.
    while (length(str)) {
        if (!match(str, "<")) {
            maxl += length(str)
            break
        }
        maxl += RSTART - 1
        str = substr(str, RSTART)
        # an unterminated tag swallows the rest instead of looping forever
        if (!match(str, ">"))
            break
        str = substr(str, RSTART + 1)
    }
    return maxl
}
####################
#append string to image with alignment and space padding
# alimg(wid) -- append the cell line held in imline to image[iline],
# padded to WID visible columns and followed by vrule.
#   wid   width of the cell in character cells
# Reads imline, align[tnst,row,col], spaces and vrule; writes image[iline]
# and may add one blank to imline when centring needs an odd split.
# The name after the extra blanks in the parameter list is a local.
function alimg(wid,    how) {
    wid -= vislength(imline)
    if (!wid) {
        image[iline] = image[iline] imline vrule
        return
    }
    how = align[tnst, row, col]
    # substr() positions start at 1; a start of 0 is not portable
    if (substr(how, 1, 6) == "CENTER") {
        if (wid % 2) {
            imline = imline " "
            wid--
        }
        wid = wid / 2
        image[iline] = image[iline] substr(spaces, 1, wid) imline \
            substr(spaces, 1, wid) vrule
    }
    else if (substr(how, 1, 5) == "RIGHT")
        image[iline] = image[iline] substr(spaces, 1, wid) imline vrule
    else
        image[iline] = image[iline] imline substr(spaces, 1, wid) vrule
}
####################
# dohline(cl, cm, cr) -- draw one horizontal rule into image[iline] and
# advance iline.
#   cl   character for the left end
#   cm   character placed between two columns
#   cr   character for the right end
# Uses the globals tnst, row, maxcol, colwid[], tcs[], rsp[], rowlines and
# sides; leaves col, cnt and ccm changed.
function dohline(cl, cm, cr) {
    image[iline] = ""
    col = 1
    if (!rowlines || rsp[tnst, row, col] < 1 || iline == 0)
        image[iline] = cl
    else
        image[iline] = vrule
    cnt = 0
    while (col < maxcol) {
        # substr() positions start at 1; a start of 0 is not portable and
        # would make every column rule one character short in some awks
        if (!rowlines || rsp[tnst, row, col] < 1 || iline == 0) {
            image[iline] = image[iline] substr(hrule, 1, colwid[tnst, col])
            ccm = cm
        }
        else {
            image[iline] = image[iline] substr(spaces, 1, colwid[tnst, col])
            rsp[tnst, row, col] = 0
            ccm = cl
        }
        # corrects line image for colspans in row (top or bottom, not all middle)
        if (cnt) {
            cnt--
            image[iline] = image[iline] hrule1
        }
        else if (tcs[tnst, row, col] <= 1)
            image[iline] = image[iline] ccm
        else {
            cnt = tcs[tnst, row, col] - 2
            image[iline] = image[iline] hrule1
        }
        col++
    }
    image[iline] = image[iline] substr(hrule, 1, colwid[tnst, col]) cr
    if (!sides)     #strip sidebars?
        image[iline] = substr(image[iline], 1, length(image[iline]) - 1)
    iline++
}
####################
#print a table into the image
# printtab() -- render the table at nesting level tnst into image[0..iline].
# Reads ttext[], tcs[], colwid[], rowhght[] and currow[] for that level,
# removes the ttext[] cells it prints, and on return drops the level's
# colwid[] and tcs[] entries.  iline is left at the index of the last line.
# The names after the extra blanks in the parameter list are locals.
function printtab(    key, part) {
    #CONFIGURE
    #sides (vertical)
    # 1, all    nonest   never
    #+------+   +----+   ----
    #|+---+ |   |--- |   ---
    #||A|B|C|   |A|BC|   A|BC
    #|+---+ |   |--- |   ---
    #+------+   +----+   ----
    sides = 1;                  #all
    #sides = ( tnst == 1 );     #nonested
    #sides = 0;                 #never

    # maxcol is one global shared by all nesting levels, so a nested table
    # may have lowered it: grow it back over columns that have a width ...
    while (colwid[tnst, maxcol + 1])
        maxcol++
    # ... and drop trailing columns that have none.  Stopping at 0 keeps a
    # table without any cell from looping forever.
    while (maxcol > 0 && !colwid[tnst, maxcol])
        maxcol--

    row = 1
    iline = 0
    #top line
    dohline(boxtl, boxt, boxtr)
    #data rows
    while (row < currow[tnst]) {
        #lines in row (valign not handled and rowspan ignored - all data
        #is in topmost row)
        hght = 0
        while (hght < rowhght[tnst, row]) {
            image[iline] = vrule
            #each col in line
            col = 1
            while (col <= maxcol) {
                imline = ttext[tnst, row, col, hght]
                delete ttext[tnst, row, col, hght]
                if (!tcs[tnst, row, col])       #short or blank row
                    tcs[tnst, row, col] = maxcol - col + 1
                # width of the spanned columns plus the rules between them
                len = -1
                cnt = 0
                while (cnt < tcs[tnst, row, col])
                    len += colwid[tnst, col + cnt++]
                len += tcs[tnst, row, col]
                alimg(len)
                col += tcs[tnst, row, col]
            }
            hght++
            if (!sides)     #strip sidebars?
                image[iline] = substr(image[iline], 1, length(image[iline]) - 1)
            iline++
        }
        #bottom or interrow line
        if (row + 1 == currow[tnst])
            dohline(boxbl, boxb, boxbr)
        else if (rowlines)
            dohline(boxlf, cross, boxrt)
        row++
    }
    iline--
    col = 1
    while (col <= maxcol)
        delete colwid[tnst, col++]
    # tcs[] has three subscripts, so "delete tcs[tnst]" removes nothing;
    # drop every entry that belongs to this nesting level instead
    for (key in tcs) {
        split(key, part, SUBSEP)
        if (part[1] == tnst)
            delete tcs[key]
    }
}
####################
#begin table data or header entry
# startentry() -- open a table data or header cell from the tag in $1.
# Advances curcol[tnst], records the cell's alignment in align[] and its
# column span in colsp[tnst] / tcs[], clears the first text line of the
# cell and sets tdflag[tnst].
# The names after the extra blanks in the parameter list are locals.
function startentry(    al, cs) {
    #missing </td>
    if (tdflag[tnst])
        endentry()
    curcol[tnst]++
    colsp[tnst] = 1
    # skip columns that rsp[] marks as still occupied from the row above
    while (rsp[tnst, currow[tnst] - 1, curcol[tnst]] >= 1)
        curcol[tnst]++

    #grab alignment
    align[tnst, currow[tnst], curcol[tnst]] = defalign[tnst]
    # substr() positions start at 1; a start of 0 is not portable
    if (substr(toupper($1) " ", 1, 3) == "TH ")
        align[tnst, currow[tnst], curcol[tnst]] = "CENTER"
    if (match(toupper($1), " ALIGN=")) {
        # take two extra characters so ALIGN="center" survives the quotes
        al = toupper(substr($1, RSTART + 7, 8))
        gsub(/["']/, "", al)
        align[tnst, currow[tnst], curcol[tnst]] = substr(al, 1, 6)
    }

    #grab colspan
    if (match(toupper($1), "COLSPAN=")) {
        cs = substr($1, RSTART + 8, 5)
        gsub(/["']/, "", cs)            # COLSPAN="2" must not read as 0
        colsp[tnst] = int(cs)
        if (colsp[tnst] < 0)            # a negative span would never count down
            colsp[tnst] = 1
    }

    #grab rowspan (parsing is disabled, every cell is one row high)
    rowsp = 1
    # if( match(toupper($1), "ROWSPAN=") )
    #     rowsp = substr($1,RSTART+8,5);
    rowspan[curcol[tnst]] = int(rowsp)
    while (rowsp) {
        rowsp--
        tcs[tnst, currow[tnst] + rowsp, curcol[tnst]] = colsp[tnst]
        # one rsp[] mark per spanned column; the offset is col1, not the
        # unrelated global col
        for (col1 = colsp[tnst] - 1; col1 >= 0; col1--)
            rsp[tnst, currow[tnst] + rowsp - 1, curcol[tnst] + col1] = rowsp
        ttext[tnst, currow[tnst] + rowsp, curcol[tnst], 0] = ""
    }
    line[tnst] = 0
    tdflag[tnst] = 1
}
####################
#correct column widths for longest text
# fixcolsp() -- widen the columns covered by the current cell so that text
# of width col fits.  On entry the global col holds the text width; it is
# replaced by the share each spanned column has to provide (rounded up),
# and colwid[] is raised to that share for the colsp[tnst] columns that
# start at curcol[tnst].  col1 is left at the number of columns visited.
# The name after the extra blanks in the parameter list is a local.
function fixcolsp(    span) {
    span = colsp[tnst]
    # a span of 0 (or an unset one) would divide by zero below
    if (span < 1)
        span = 1
    col = int((col + span - 1) / span)
    for (col1 = 0; col1 < span; col1++)
        if (col > colwid[tnst, curcol[tnst] + col1])
            colwid[tnst, curcol[tnst] + col1] = col
}
####################
# stripsd() -- remove every blank at either end of the global imline.
function stripsd() {
    sub(/ +$/, "", imline)
    sub(/^ +/, "", imline)
}
####################
#end table data or header entry
# endentry() -- close the current table data or header cell: trim each of
# its text lines, widen the columns to fit, record the row height and move
# curcol[tnst] past the columns the cell spans.
function endentry() {
    if (colwid[tnst, curcol[tnst]] == 0)
        colwid[tnst, curcol[tnst]] = 1
    if (!colsp[tnst])
        colsp[tnst] = 1
    lastcol = 0

    for (lx = 0; lx <= line[tnst]; lx++) {
        #trim edge spaces
        imline = ttext[tnst, currow[tnst], curcol[tnst], lx]
        stripsd()
        col = vislength(imline)
        # an empty line still takes one blank
        if (!col) {
            imline = imline " "
            col = 1
        }
        ttext[tnst, currow[tnst], curcol[tnst], lx] = imline
        #print tnst " row:" currow[tnst] " col:" curcol[tnst] " line:" lx " len:" col ">" ttext[tnst,currow[tnst],curcol[tnst],lx] "<";
        if (col > lastcol) {
            fixcolsp()
            lastcol = col
        }
    }

    #remove trailing blank lines
    while (lx > 1 && ttext[tnst, currow[tnst], curcol[tnst], lx - 1] == " ")
        lx--
    #FIXME need to spread rowhght among rowspan
    if (lx > rowhght[tnst, currow[tnst]])
        rowhght[tnst, currow[tnst]] = lx

    # step over the extra columns of a colspan, marking each with tcs = 0
    for (; colsp[tnst] > 1; colsp[tnst]--) {
        # rowspan[curcol[tnst]+1] = rowspan[curcol[tnst]];
        curcol[tnst]++
        tcs[tnst, currow[tnst], curcol[tnst]] = 0
    }

    tdflag[tnst] = 0
    line[tnst] = 0
    tralready = 0
}
####################
#normalize structures in cases of omitted </td> or too few entries
# endrow() -- finish the current row of the table at level tnst: close a
# cell that is still open, remember the largest column number reached and
# step to the next row with a height of one line.
function endrow() {
    if (tdflag[tnst]) {
        endentry()
    }
    maxcol = (curcol[tnst] > maxcol) ? curcol[tnst] : maxcol
    curcol[tnst] = 0
    line[tnst] = 0
    currow[tnst]++
    rowhght[tnst, currow[tnst]] = 1
    tralready = 1
}
####################
#begin table data row
# startrow() -- begin a table row from the <TR> tag in $1.  Ends the
# previous row when its </tr> was left out, then stores the row's default
# alignment in defalign[tnst] and resets the row height.
# The name after the extra blanks in the parameter list is a local.
function startrow(    al) {
    #omitted </tr>
    if (tdflag[tnst] || !tralready)
        endrow()
    tralready = 0
    #valign?
    defalign[tnst] = "default"
    if (match(toupper($1), " ALIGN=")) {
        # take two extra characters so ALIGN="center" survives the quotes
        al = toupper(substr($1, RSTART + 7, 8))
        gsub(/["']/, "", al)
        defalign[tnst] = substr(al, 1, 6)
    }
    rowhght[tnst, currow[tnst]] = 1
    tcs[tnst, currow[tnst], 1] = 0
}
####################
#remove quotes around strings, i.e. "123" becomes 123
# stripit(pval) -- reduce the tail of a tag, starting at an attribute
# value, to that value alone.
#   pval     text beginning with the value, e.g. "123" size=4
#   returns  the value without surrounding quotes; an unquoted value ends
#            at the first blank.  Any ">" is removed.
# The names after the extra blanks in the parameter list are locals.
function stripit(pval,    quote, stop) {
    # substr() positions start at 1; with a start of 0 some awks return an
    # empty string here and the quote is never seen
    quote = substr(pval, 1, 1)
    if (quote == "\"" || quote == "'") {
        pval = substr(pval, 2)
        stop = index(pval, quote)
        if (stop)
            pval = substr(pval, 1, stop - 1)
    }
    else if (match(pval, " "))
        pval = substr(pval, 1, RSTART - 1)
    gsub(">", "", pval)
    return pval
}
####################
# print text outside tables
# doxline() -- print the lines collected for level 0 (ttext[0,1,0,n] for
# n up to line[0]) with edge blanks removed, skipping empty ones; then
# empty ttext[] altogether and reset line[0].
function doxline() {
    for (inrow = 0; inrow <= line[0]; ) {
        imline = ttext[0, 1, 0, inrow++]
        stripsd()
        if (imline != "")
            print imline
    }
    delete ttext
    line[0] = 0
}
####################
#main
#set some variables
BEGIN {
    # One record per tag: a record is the text between two "<", and ">"
    # separates the tag ($1) from the text that follows it ($2).
    # Plain "<" and ">" are used; "\<" and "\>" are not defined escapes.
    RS = "<"
    FS = ">"
    tnst = 0            # table nesting level, 0 = outside any table
    tdflag[0] = 0
    colsp[0] = 1
    tralready = 0
    line[0] = 0
    currow[0] = 1
    curcol[0] = 0
    #CONFIGURE - set to 1 for lines between rows
    rowlines = 0
    #CONFIGURE - set to 1 for lines outside of tables
    extlines = 1
    #CONFIGURE - set to 1 to stack all tables vertically
    #will remove some lines outside of nested tables
    vertstack = 0
    #CONFIGURE
    #split if column would be wider than
    maxwid = 32
    #split at the first space after backing up
    splitat = 10
    #CONFIGURE
    # PC Graphics characters (single);
    boxtl = "\332"; boxt = "\302"; boxtr = "\277"
    boxbl = "\300"; boxb = "\301"; boxbr = "\331"
    vrule = "\263"; hrule = "\304"
    boxlf = "\303"; boxrt = "\264"; cross = "\305"
    # PC Graphics characters (double);
    # boxtl = "\311"; boxt = "\313"; boxtr = "\273";
    # boxbl = "\310"; boxb = "\312"; boxbr = "\274";
    # vrule = "\272"; hrule = "\315"; #cross = "\316";
    # Ascii boxes - uncomment all three lines together.  The line that sets
    # vrule/hrule/cross had lost its "#" to line wrapping, which mixed
    # ASCII rules with graphics corners.
    # boxtl = "+"; boxt = "+"; boxtr = "+"; boxbl = "+"; boxb = "+"; boxbr = "+";
    # boxlf = "+"; boxrt = "+";
    # vrule = "|"; hrule = "-"; cross = "+";

    # hrule1 keeps the single rule character; hrule, spaces and underl are
    # 256-character runs that substr() cuts to the width needed
    hrule1 = hrule
    hrule = hrule hrule hrule hrule         #4
    hrule = hrule hrule hrule hrule         #16
    hrule = hrule hrule hrule hrule         #64
    hrule = hrule hrule hrule hrule         #256
    spaces = " "
    spaces = spaces spaces spaces spaces
    spaces = spaces spaces spaces spaces
    spaces = spaces spaces spaces spaces
    spaces = spaces spaces spaces spaces
    underl = "_"
    underl = underl underl underl underl
    underl = underl underl underl underl
    underl = underl underl underl underl
    underl = underl underl underl underl
}
#################### MAIN
# istag(name) -- true when the tag of the current record ($1) is NAME:
# either exactly NAME or NAME followed by a blank, compared in upper case.
function istag(name) {
    # substr() positions start at 1; a start of 0 is not portable
    return substr(toupper($1) " ", 1, length(name) + 1) == name " "
}

#################### MAIN
# Runs once per record.  $1 is the tag, $2 the text that follows it.
# Outside tables the record is passed through; inside a table the text is
# collected in ttext[level,row,col,line] and drawn when </TABLE> arrives.
{
    # a tag may be broken across lines: treat that white space as blanks
    gsub(/[\t\r\n]/, " ", $1)

    if (istag("TABLE")) {
        if (tnst > 0) {
            # nested table outside any cell: open a row and a cell for it
            if (!tdflag[tnst]) {
                startrow()
                startentry()
            }
        }
        #text outside tables
        else if (extlines)
            doxline()
        tnst++
        currow[tnst] = 1
        curcol[tnst] = 0
        line[tnst] = 0
        rowhght[tnst, 1] = 1
        maxcol = 0
        tralready = 1
    }

    if (tnst > 0) {
        #begin/end markers
        if (istag("TD") || istag("TH"))
            startentry()
        else if (istag("/TD") || istag("/TH"))
            endentry()
        else if (istag("TR"))
            startrow()
        else if (istag("/TR"))
            endrow()
        #END OF TABLE
        else if (istag("/TABLE")) {
            $1 = ""
            if (!tralready)
                endrow()
            tralready = 0
            #generate image
            print ttext[tnst, 1, 0, 0]
            printtab()
            tnst--
            #go past a nonblank line
            imline = ttext[tnst, currow[tnst], curcol[tnst], line[tnst]]
            stripsd()
            ttext[tnst, currow[tnst], curcol[tnst], line[tnst]] = imline
            if (length(imline))
                line[tnst]++
            #print or copy the rendered subtable
            inrow = 0
            if (!tnst) {
                print "<PRE>"
                while (inrow <= iline) {
                    # underscores only reserve room for fields and images
                    gsub("_", "", image[inrow])
                    print image[inrow++]
                }
                print "</PRE>"
                line[tnst] = 0
                delete tcs
                delete colwid
                delete rsp
                system("")
            }
            else {
                while (inrow <= iline) {
                    ttext[tnst, currow[tnst], curcol[tnst], line[tnst] + inrow] = image[inrow]
                    inrow++
                }
                col = length(image[inrow - 1])
                colsp[tnst] = tcs[tnst, currow[tnst], curcol[tnst]]
                # fixcolsp() divides by the span, so it must not be 0
                if (colsp[tnst] < 1)
                    colsp[tnst] = 1
                fixcolsp()
                line[tnst] += inrow
                if (line[tnst] > rowhght[tnst, currow[tnst]])
                    rowhght[tnst, currow[tnst]] = line[tnst]
            }
            inrow = 0
            currow[tnst + 1] = 0
            if (tnst && (vertstack || !tdflag[tnst])) {
                endentry()
                endrow()
            }
            tdflag[tnst] = 0
        }
    }

    #indicate options follow
    #DEVEL
    #mainly for forms - extract name in select or input
    # if( match(toupper($1), "NAME=") ) {
    #     name = substr($1,RSTART+5,64);
    #     if( match(name," ") )
    #         name = substr(name,1,RSTART-1 );
    #     gsub( ">" , "", name );
    #     name = stripit( name );
    #     $2 = name "=" $2;
    # }

    #fix character formats
    if (!length($2))
        $2 = " "
    if (NF > 1 && length($2)) {
        # line ends and tabs become blanks (deleting them would join the
        # words on either side), then runs of blanks shrink to one.
        # The pattern must need at least one blank: " *" also matches the
        # empty string and would put a blank between all characters.
        gsub(/[\t\r\n]/, " ", $2)
        gsub(/ +/, " ", $2)
        if (!tdflag[tnst])
            sub(" $", "", $2)
    }

    #append tags
    if (tnst) {
        if (istag("FORM") || istag("INPUT") || istag("OPTION") ||
            istag("SELECT") || istag("/FORM") || istag("/SELECT") ||
            istag("IMG") || istag("A")) {
            if (istag("A") && match(toupper($1), "HREF")) {
                # the link text is shown as [LINK], the original text follows
                imline = ttext[tnst, currow[tnst], curcol[tnst], line[tnst]] \
                    "<" $1 ">[LINK]</a>" $2
                ttext[tnst, currow[tnst], curcol[tnst], line[tnst]] = imline
                line[tnst]++
                ttext[tnst, currow[tnst], curcol[tnst], line[tnst]] = ""
                $1 = ""
                $2 = ""
            }
            else {
                imline = ttext[tnst, currow[tnst], curcol[tnst], line[tnst]] "<" $1 ">"
                ttext[tnst, currow[tnst], curcol[tnst], line[tnst]] = imline
            }
        }

        #line breaking entries
        if (istag("BR") || istag("HR") || istag("LI") || istag("P")) {
            # substr(imline,0,7) == "OPTION " || \
            # substr(imline,0,8) == "/SELECT " || \
            #ignore for blank lines
            imline = ttext[tnst, currow[tnst], curcol[tnst], line[tnst]]
            stripsd()
            ttext[tnst, currow[tnst], curcol[tnst], line[tnst]] = imline
            if (length(imline)) {
                line[tnst]++
                ttext[tnst, currow[tnst], curcol[tnst], line[tnst]] = ""
            }
        }

        #form input fields
        # the tag is tested through $1: imline may hold cell text by now
        if (istag("INPUT")) {
            size = 0
            if (match(toupper($1), "SIZE=")) {
                size = substr($1, RSTART + 5, 64)
                # "+ 0" makes it a number, otherwise the comparison with
                # length() below is done on strings
                size = stripit(size) + 0
            }
            value = ""
            if (match(toupper($1), "VALUE=")) {
                value = substr($1, RSTART + 6, 256)
                value = stripit(value)
                sub(/ +$/, "", value)
                sub(/^ +/, "", value)
            }
            if (match(toupper($1), "TYPE=")) {
                type = substr(toupper($1), RSTART + 5, 256)
                type = stripit(type)
                if (toupper(type) == "CHECKBOX")
                    value = "___"
                if (toupper(type) == "RADIO")
                    value = "___"
            }
            if (length(value) < size)
                value = value substr(underl, 1, size - length(value))
            # underscores stand in for the field: one more than its width
            value = substr(underl, 1, length(value) + 1)
            if (!match(toupper($1), "TYPE=.?HIDDEN"))
                ttext[tnst, currow[tnst], curcol[tnst], line[tnst]] = \
                    ttext[tnst, currow[tnst], curcol[tnst], line[tnst]] value
        }

        #extract ALT string
        if (istag("IMG")) {
            sub("[Aa][Ll][Tt]", "ALT", $1)
            sub("ALT =", "ALT=", $1)
            sub("ALT= ", "ALT=", $1)
            if (match(toupper($1), " ALT=")) {
                name = substr($1, RSTART + 5, length($1) - 7)
                name = stripit(name)
                # room for the ALT text plus two more characters
                $2 = substr(underl, 1, length(name) + 2) $2
            }
            else
                $2 = "__________" $2
            imline = ttext[tnst, currow[tnst], curcol[tnst], line[tnst]] $2
            ttext[tnst, currow[tnst], curcol[tnst], line[tnst]] = imline
            line[tnst]++
            ttext[tnst, currow[tnst], curcol[tnst], line[tnst]] = ""
        }
        else {
            #append line
            imline = ttext[tnst, currow[tnst], curcol[tnst], line[tnst]]
            # no second blank where the line already ends in one
            if (imline ~ / $/ && $2 ~ /^ /)
                $2 = substr($2, 2)
            imline = imline $2
            ttext[tnst, currow[tnst], curcol[tnst], line[tnst]] = imline
        }
        # if( !tdflag[tnst] )
        #     pre[tnst,currow[tnst]] = pre[tnst,currow[tnst]] imline;

        if (length($2)) {
            #Split long lines - not fully tested
            while (!match(imline, "<") &&
                   vislength(imline) > maxwid * colsp[tnst]) {
                # look for a blank at or after the split point, moving the
                # point back by splitat each time none is found
                temp = 1
                stspl = maxwid * colsp[tnst] - splitat * temp
                while (stspl > 0) {
                    if (match(substr(imline, stspl), " ")) {
                        ttext[tnst, currow[tnst], curcol[tnst], line[tnst]] = \
                            substr(imline, 1, stspl + RSTART - 1)
                        # no length limit: the rest must not be cut short
                        ttext[tnst, currow[tnst], curcol[tnst], line[tnst] + 1] = \
                            substr(imline, stspl + RSTART - 1)
                        stspl = -1
                    }
                    else
                        stspl = maxwid * colsp[tnst] - splitat * ++temp
                }
                line[tnst]++
                imline = ttext[tnst, currow[tnst], curcol[tnst], line[tnst]]
            }
        }
    }
    else if (NR == 1) {
        # whatever precedes the first "<" is text, not a tag
        if ($1 ~ /[^ ]/)
            print $1
    }
    else if ($1 != "")
        print "<" $1 ">" $2
    else if ($2 != " ")
        # the tag was consumed (top-level </TABLE>): print the text only
        print $2
    # imline = ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] "<" $1 ">" $2;
    # ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] = imline;
    #print tnst","currow[tnst]","curcol[tnst]","line[tnst] "<" $1 ">" $2 ":" ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] ":";
}
# At end of input, print whatever level-0 text is still held back.
END {
    if (extlines) {
        doxline()
    }
}
;
; To UNSUBSCRIBE: Send a mail message to address@hidden
; with "unsubscribe lynx-dev" (without the
; quotation marks) on a line by itself.
;