lynx-dev
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: LYNX-DEV lynx and tables - t2thtml awk script


From: Tom Zerucha
Subject: Re: LYNX-DEV lynx and tables - t2thtml awk script
Date: Tue, 21 Jan 1997 17:29:19 -0500 (EST)

The following awk script will make a passable table output in lynx,
at least on the Linux Console.  Check the CONFIGURE stuff if you don't
want graphics characters or might want other options.

You need to pass the HTML through the script, e.g.
lynx -source wherever.com/whatever.html | t2thtml.awk >/tmp/lt.html ;
lynx /tmp/lt.html

(I may put up an HTTP proxy that allows filters when I get more
ambitious.)

Forms almost look right, and may even work sometimes.

Bad things happen with
<form>...<TD>...</TD></FORM><TD>
since lynx likes to break lines at </form>.

Worse things happen with
</TR></FORM><FORM...><TR>
since I need to add a prefix tag store.

The t2tgrnest script is also available - an earlier version is at my website,
and an updated one, along with t2thtml, will be there when I get a chance to
post them.

address@hidden
finger address@hidden for PGP key

------------------------------------------------------

#!/usr/bin/gawk -f

# 1/21/97 first pass
# problems with ROWSPAN and with many forms in tables:
# they often look right, but won't all work

#look for CONFIGURE for changeable parameters

####################
#Measure how many screen columns a piece of marked-up text occupies.
#(Kept self-contained so it can be replaced when integrating into
#something like lynx.)
#  istr - text that may contain tags and character entities
#Returns the visible width: tags count as zero columns, the entities
#listed below as one column each, and every <SELECT>...</SELECT> region
#as the size computed for its widest option plus one.
#The names after istr are local scratch variables (awk idiom); callers
#pass istr only.

function vislength( istr,   str, maxl, selrgn, point1, point2, smaxl ) {

  str = istr;

#bytes \200-\240 are removed and therefore never counted
  gsub("[\200-\240]","",str);

#collapse each known entity to a single character (\046 is "&")
  gsub("\046amp;","\\&",str);
  gsub("\046#169;","C",str);
  gsub("\046reg;","R",str);
  gsub("\046copy;","R",str);
  gsub("\046#162;","c",str);
  gsub("\046#160;"," ",str);
  gsub("\046nbsp;"," ",str);
  gsub("\046nbsp"," ",str);

  str = toupper(str);
  maxl = 0;

#measure, then cut out, every SELECT region
  while( match( str , "<SELECT" ) ) {

    point1 = RSTART;
    selrgn = substr( str, point1 );
    match( selrgn, "</SELECT" );
    point2 = RSTART;
#keep the region up to and including one character past "</SELECT".
#substr() positions start at 1; a start of 0 silently costs one
#character on POSIX-conforming awks.
    selrgn = substr( selrgn, 1, point2+9 );

    while( sub("> ", ">", selrgn ) );
    while( sub(" <", "<", selrgn ) );
    smaxl = 1;
    while( match( selrgn , "<OPTION" ) ) {
#skip the rest of the OPTION tag, then look at the text that follows
      selrgn = substr( selrgn , RSTART+7 );
      match( selrgn , ">" );
      selrgn = substr( selrgn , RSTART+1 );
      if( match( selrgn , "<" ) ) {
        if( RSTART > smaxl )
          smaxl = RSTART;
      }
      else if( length( selrgn ) > smaxl )
        smaxl = length( selrgn );
    }
#drop the region from str so the tag scan below does not see it again
    str = substr( str , 1 , point1 - 1 ) substr( str , point1 + point2 + 8 );
    maxl += smaxl + 1;
  }

#count the text between the remaining tags
  while( length( str ) ) {
    if( !match( str , "<" ) )
      return maxl + length( str );

    maxl += RSTART-1;
#continue from the "<" so that a ">" in the text before it is not
#mistaken for the end of this tag
    str = substr( str , RSTART );

#a "<" that is never closed: stop instead of looping forever
    if( !match( str , ">" ) )
      break;
    str = substr( str , RSTART+1 );
  }
  return maxl;
}

####################
#Append the cell text held in imline to the current image line, padded
#with spaces to the cell width and followed by a vertical rule.
#  wid - total width of the cell in columns
#Reads imline, align[tnst,row,col], spaces and vrule; extends image[iline].
#No value is returned.
function alimg( wid ) {

#columns of padding still needed after the visible text
  wid -= vislength( imline );

#substr() positions start at 1. With a start of 0, POSIX-conforming awks
#return one character fewer, which broke both the alignment tests and
#the amount of padding.
  if( !wid )
    image[iline] = image[iline] imline vrule;
  else if( substr(align[tnst,row,col],1,6) == "CENTER" ) {
#odd padding: the spare column goes directly after the text
    if( wid  % 2 ) {
      imline = imline " ";
      wid--;
    }
    wid = wid / 2;
    image[iline] = image[iline] substr( spaces, 1, wid ) imline substr( spaces, 1, wid ) vrule;
  }
  else if( substr(align[tnst,row,col],1,5) == "RIGHT" )
    image[iline] = image[iline] substr( spaces, 1, wid ) imline vrule;
  else
#anything else is left aligned
    image[iline] = image[iline] imline substr( spaces, 1, wid ) vrule;

}

####################
#Draw one horizontal rule of the table into image[iline] and advance iline.
#  cl - character for the left end
#  cm - character where the rule crosses a column boundary
#  cr - character for the right end
#Reads rowlines, rsp, tcs, colwid, maxcol, hrule, hrule1, spaces, vrule,
#sides, tnst and row; uses the globals col, cnt and ccm as scratch.
function dohline( cl , cm, cr ) {

  image[iline] = "";
  col = 1;

  if( !rowlines || rsp[tnst,row,col] < 1 || iline == 0 )
    image[iline] = cl;
  else
    image[iline] = vrule;

  cnt = 0;
  while( col < maxcol ) {

#substr() positions start at 1; a start of 0 makes POSIX-conforming awks
#return one character fewer, leaving every column one short
    if( !rowlines || rsp[tnst,row,col] < 1 || iline == 0 ) {
      image[iline] = image[iline] substr( hrule , 1, colwid[tnst,col] );
      ccm = cm;
    }
    else {
#cell continued from the row above: leave it blank and clear the mark
      image[iline] = image[iline] substr( spaces , 1, colwid[tnst,col] );
      rsp[tnst,row,col] = 0;
      ccm = cl;
    }
# corrects line image for colspans in row (top or bottom, not all middle)
    if( cnt ) {
      cnt--;
      image[iline] = image[iline] hrule1;
    }
    else if( tcs[tnst,row,col] <= 1 )
      image[iline] = image[iline] ccm;
    else {
#inside a colspan: draw plain rule over the hidden boundaries
      cnt = tcs[tnst,row,col] - 2;
      image[iline] = image[iline] hrule1;
    }
    col++;
  }
#last column and right end
  image[iline] = image[iline] substr( hrule , 1, colwid[tnst,col] ) cr;

  if( !sides ) #strip sidebars?
    image[iline] = substr(image[iline],1,length( image[iline] )-1 );

  iline++;
}

####################
#Render the table at nesting level tnst into image[0..iline].
#Reads the collected cell text (ttext), spans (tcs), column widths
#(colwid), row heights (rowhght) and currow[tnst]; consumes ttext and
#colwid for this level as it goes. On return iline is the index of the
#last image line written.
#key and part are local scratch variables (awk idiom).
function printtab(   key, part )  {

#CONFIGURE

#sides (vertical)
# 1, all  nonest never
#+------+ +----+ ----
#|+---+ | |--- | ---
#||A|B|C| |A|BC| A|BC
#|+---+ | |--- | ---
#+------+ +----+ ----
sides = 1; #all
#sides = ( tnst == 1 ); #nonested
#sides = 0; #never

#ignore trailing columns that never received a width; the maxcol test
#keeps a table without any cells from counting down forever
  while( maxcol > 0 && !colwid[tnst,maxcol] )
    maxcol--;
  row = 1;
  iline = 0;
#top line
  dohline( boxtl , boxt , boxtr );

#data rows
  while( row < currow[tnst] ) {

#lines in row (valign not handled and rowspan ignored - all data is in
#topmost row)
    hght = 0;
    while( hght < rowhght[tnst,row] ) {
      image[iline] = vrule;
#each col in line
      col = 1;
      while( col <= maxcol ) {
        imline = ttext[tnst,row,col,hght];
        delete ttext[tnst,row,col,hght];
        if( !tcs[tnst,row,col] ) #short or blank row
          tcs[tnst,row,col] = maxcol - col + 1;
#cell width = widths of the spanned columns plus the hidden separators
        len = -1;
        cnt = 0;
        while( cnt < tcs[tnst,row,col] )
          len += colwid[tnst,col+cnt++];
        len += tcs[tnst,row,col];
        alimg(len);
        col += tcs[tnst,row,col];
      }
      hght++;
      if( !sides ) #strip sidebars?
        image[iline] = substr( image[iline],1,length( image[iline] ) - 1 );
      iline++;
    }
#bottom or interrow line
    if( row + 1 == currow[tnst] )
      dohline( boxbl , boxb , boxbr );
    else if( rowlines )
      dohline( boxlf , cross , boxrt );
    row++;
  }
  iline--;
  col = 1;
  while( col <= maxcol )
    delete colwid[tnst,col++];
#forget the spans of this nesting level. tcs is subscripted
#[tnst,row,col], so "delete tcs[tnst]" names no element and removed
#nothing; walk the keys instead.
  for( key in tcs ) {
    split( key, part, SUBSEP );
    if( part[1] == tnst )
      delete tcs[key];
  }
}

####################
#Begin a table data (TD) or header (TH) entry.
#Reads the tag text from $1, advances curcol[tnst] to the next free
#column, records the alignment and column span of the new cell and
#resets its text. Sets tdflag[tnst] to show that a cell is open.
#alval and spanval are local scratch variables (awk idiom).
function startentry(   alval, spanval ) {

#missing </td>
  if( tdflag[tnst] )
    endentry();

  curcol[tnst]++;
  colsp[tnst] = 1;

#skip columns still occupied by a cell spanning down from the row above
  while( rsp[tnst,currow[tnst]-1,curcol[tnst]] >= 1 )
    curcol[tnst]++;

#grab alignment
  align[tnst,currow[tnst],curcol[tnst]] = defalign[tnst];
#substr() positions start at 1; with a start of 0 POSIX-conforming awks
#return one character fewer and the comparison can never succeed
  if( substr(toupper($1)" ",1,3) == "TH " )
    align[tnst,currow[tnst],curcol[tnst]] = "CENTER";
  if( match(toupper($1), " ALIGN=") ) {
#accept ALIGN=center as well as ALIGN="center"
    alval = substr($1,RSTART+7,7);
    sub( /^["']/ , "" , alval );
    align[tnst,currow[tnst],curcol[tnst]] = toupper(substr(alval,1,6));
  }

#grab colspan
  if( match(toupper($1), "COLSPAN=") ) {
#a quoted value used to convert to 0, which made the cell swallow the
#rest of the row when the table was printed
    spanval = substr($1,RSTART+8,6);
    sub( /^["']/ , "" , spanval );
    colsp[tnst] = int(spanval);
    if( colsp[tnst] < 1 )
      colsp[tnst] = 1;
  }

#grab rowspan;
  rowsp = 1;
#  if( match(toupper($1), "ROWSPAN=") )
#    rowsp = substr($1,RSTART+8,5);
  rowspan[curcol[tnst]] = int(rowsp);

  while( rowsp ) {
    rowsp--;
    col1 = colsp[tnst];
    tcs[tnst,currow[tnst]+rowsp,curcol[tnst]] = col1;
#mark each spanned column; the offset is col1 (the loop counter),
#not the unrelated global col
    while( col1-- )
      rsp[tnst,currow[tnst]+rowsp-1,curcol[tnst]+col1] = rowsp;
    ttext[tnst,currow[tnst]+rowsp,curcol[tnst],0] = "";
  }

  line[tnst] = 0;
  tdflag[tnst] = 1;
}

####################
#Widen the columns of the current cell so that text of width col fits.
#On entry the global col holds the visible width of the text; it is
#divided (rounding up) among the colsp[tnst] columns starting at
#curcol[tnst], and each colwid entry is raised to that share if smaller.
#On return col holds the per-column share and col1 the number of
#columns examined. span is a local scratch variable (awk idiom).
function fixcolsp(   span ) {
#a span below 1 (for example an unparsed COLSPAN) would divide by zero,
#which is fatal in awk; treat it as a single column
  span = colsp[tnst];
  if( span < 1 )
    span = 1;

  col = int( ( col + span - 1 ) / span ) ;
  col1 = 0;
  while( col1 < span ) {
    if( col > colwid[tnst,curcol[tnst]+col1] )
      colwid[tnst,curcol[tnst]+col1] = col;
    col1++;
  }
}

####################
#Remove every leading and trailing space from the global imline, in place.
#Only the space character is stripped.
function stripsd () {
  sub( / +$/ , "" , imline );
  sub( /^ +/ , "" , imline );
}

####################
#End a table data or header entry.
#Trims every collected line of the current cell, widens the columns to
#fit the longest one, raises the row height if needed, and steps
#curcol[tnst] past the columns covered by the cell's colspan. Clears
#tdflag[tnst] to show that no cell is open.
function endentry() {

  if( colwid[tnst,curcol[tnst]] == 0 )
    colwid[tnst,curcol[tnst]] = 1;
  if( !colsp[tnst] )
    colsp[tnst] = 1;

  lastcol = 0;
  lx = 0;
  while( lx <= line[tnst] ) {

#trim edge spaces
    imline = ttext[tnst,currow[tnst],curcol[tnst],lx];
    stripsd();
    col = vislength( imline );
#an empty line still occupies one column
    if( !col ) {
      imline = imline " ";
      col = 1;
    }
    ttext[tnst,currow[tnst],curcol[tnst],lx] = imline;

#debug trace, kept entirely inside comments (mail wrapping had turned
#its second half into a live statement):
#print tnst " row:" currow[tnst] " col:" curcol[tnst] " line:" lx " len:" col \
#  ">" ttext[tnst,currow[tnst],curcol[tnst],lx] "<";
    if( col > lastcol ) {
      fixcolsp();
#fixcolsp() leaves the per-column share in col
      lastcol = col;
    }
    lx++;
  }
#remove trailing blank lines
  while( lx > 1 && ttext[tnst,currow[tnst],curcol[tnst],lx-1] == " " )
    lx--;

#FIXME need to spread rowhght among rowspan
  if( lx > rowhght[tnst,currow[tnst]] )
    rowhght[tnst,currow[tnst]] = lx;

#columns hidden under a colspan get a span of 0
  while( colsp[tnst] > 1 ) {
#    rowspan[curcol[tnst]+1] = rowspan[curcol[tnst]];
    curcol[tnst]++;
    tcs[tnst,currow[tnst],curcol[tnst]] = 0;
    colsp[tnst]--;
  }

  tdflag[tnst] = 0;
  line[tnst] = 0;
  tralready = 0;
}

####################
#Close the current table row, coping with an omitted </td> or too few
#entries: finish any open cell, remember the widest row seen so far in
#maxcol, then move to the next row with height 1 and no current column.
function endrow () {
  if( tdflag[tnst] )
    endentry();

  if( maxcol < curcol[tnst] )
    maxcol = curcol[tnst];

#step to the next row and give it the default height
  rowhght[tnst, ++currow[tnst]] = 1;

  curcol[tnst] = 0;
  line[tnst] = 0;
  tralready = 1;
}

####################
#Begin a table data row.
#Closes the previous row when its </tr> was omitted, reads the row's
#default alignment from the tag text in $1 and resets the height and
#first-column span of the new row.
#alval is a local scratch variable (awk idiom).
function startrow (   alval ) {
#omitted </tr>
  if( tdflag[tnst] || !tralready )
    endrow();
  tralready = 0;
#valign?
  defalign[tnst] = "default";
  if( match(toupper($1), " ALIGN=") ) {
#accept ALIGN=center as well as ALIGN="center": take one extra
#character, drop a leading quote, then keep six as before
    alval = substr($1,RSTART+7,7);
    sub( /^["']/ , "" , alval );
    defalign[tnst] = toupper(substr(alval,1,6));
  }
  rowhght[tnst,currow[tnst]] = 1;
  tcs[tnst,currow[tnst],1] = 0;
}

####################
#Extract an attribute value from the text that follows "NAME=".
#  pval - text starting at the value, possibly with more attributes after
#Returns the value without surrounding double quotes ("123" becomes 123);
#an unquoted value ends at the first space. Any ">" is removed.
function stripit ( pval ) {
#substr() positions start at 1; with a start of 0 POSIX-conforming awks
#return one character fewer, so the quote was never seen and unquoted
#values lost their last character
  if( substr(pval,1,1) == "\"" ) {
    pval = substr(pval,2,length(pval)-1);
    if( match( pval , "\"" ) )
      pval = substr(pval,1,RSTART-1);
  }
  else if( match(pval," ") )
    pval = substr(pval,1,RSTART-1);
#plain ">" - the string "\>" is an undefined escape sequence
  gsub( ">" , "", pval );
  return pval;
}

####################
#Print the text collected outside of tables.
#Walks the lines stored under ttext[0,1,0,*] up to line[0], prints each
#one that is non-empty after trimming, then discards all collected text
#and resets line[0].
function doxline() {
  for( inrow = 0; inrow <= line[0]; inrow++ ) {
    imline = ttext[0,1,0,inrow];
    stripsd();
    if( imline != "" )
      print imline;
  }
  delete ttext;
  line[0] = 0;
}

####################
#main

#set some variables
#Records are split at "<" and fields at ">", so every record is one tag
#plus the text that follows it: $1 is the tag contents, $2 the text.
BEGIN {
#plain "<" and ">" - "\<" and "\>" are undefined string escapes
  RS = "<" ;
  FS = ">" ;

#tnst is the table nesting level; 0 means outside any table
  tnst = 0;
  tdflag[0] = 0;
  colsp[0] = 1;

  tralready = 0;
  line[0] = 0;
  currow[0] = 1;
  curcol[0] = 0;

#CONFIGURE - set to 1 for lines between rows
  rowlines = 0;

#CONFIGURE - set to 1 for lines outside of tables
  extlines = 1;

#CONFIGURE - set to 1 to stack all tables vertically
#will remove some lines outside of nested tables
  vertstack = 0;

#CONFIGURE
#split if column would be wider than
  maxwid = 32;
#split at the first space after backing up
  splitat = 10;

#CONFIGURE

# PC Graphics characters (single);
  boxtl = "\332";  boxt = "\302";  boxtr = "\277";
  boxbl = "\300";  boxb = "\301";  boxbr = "\331";
  vrule = "\263";  hrule = "\304";
  boxlf = "\303";  boxrt = "\264"; cross = "\305";

# PC Graphics characters (double);
#  boxtl = "\311";  boxt = "\313";  boxtr = "\273";
#  boxbl = "\310";  boxb = "\312";  boxbr = "\274";
#  vrule = "\272";  hrule = "\315";  #cross = "\316";

# Ascii boxes (uncomment both lines to use them; mail wrapping had left
# the second line active, mixing "|" and "-" with the graphics corners)
#  boxtl = "+"; boxt = "+"; boxtr = "+"; boxbl = "+"; boxb = "+"; boxbr = "+";
#  vrule = "|"; hrule = "-"; cross = "+";

#single rule character, used where a colspan hides a column boundary
  hrule1 = hrule;

#256-character strings to cut rules and padding from
  hrule = hrule hrule hrule hrule; #4
  hrule = hrule hrule hrule hrule; #16
  hrule = hrule hrule hrule hrule; #64
  hrule = hrule hrule hrule hrule; #256

  spaces = " ";
  spaces = spaces spaces spaces spaces;
  spaces = spaces spaces spaces spaces;
  spaces = spaces spaces spaces spaces;
  spaces = spaces spaces spaces spaces;

  underl = "_";
  underl = underl underl underl underl;
  underl = underl underl underl underl;
  underl = underl underl underl underl;
  underl = underl underl underl underl;

}

#################### MAIN
#Runs once per record. With RS "<" and FS ">", $1 is the contents of one
#tag and $2 the text that follows it up to the next tag.
#Outside tables (tnst == 0) the record is printed back unchanged; inside
#tables the text is collected per cell in ttext and, at </TABLE>, rendered
#into image[] and either printed inside <PRE> (outermost table) or copied
#into the enclosing cell (nested table).
#All substr() calls start at position 1: with a start of 0,
#POSIX-conforming awks return one character fewer, so none of the tag
#comparisons could succeed.
{
  if( substr(toupper($1)" ",1,6) == "TABLE " ) {

    if( tnst > 0 ) {
#nested table outside any cell: open a row and a cell to hold it
      if( !tdflag[tnst] ) {
        startrow();
        startentry();
      }
    }
#text outside tables
    else if ( extlines )
      doxline();

    tnst++;
    currow[tnst] = 1;
    curcol[tnst] = 0;
    line[tnst] = 0;
    rowhght[tnst,1] = 1;
    maxcol = 0;
    tralready = 1;
  }

  if( tnst > 0 ) {
#begin/end markers
    if( substr(toupper($1)" ",1,3) == "TD " \
        || substr(toupper($1)" ",1,3) == "TH " )
      startentry();
    else if( substr(toupper($1)" ",1,4) == "/TD " \
        || substr(toupper($1)" ",1,4) == "/TH " )
      endentry();
    else if( substr(toupper($1)" ",1,3) == "TR " )
      startrow();
    else if( substr(toupper($1)" ",1,4) == "/TR " )
      endrow();
#END OF TABLE
    else if( substr(toupper($1)" ",1,7) == "/TABLE " ) {
      $1 = "";
      if( !tralready )
        endrow();
      tralready = 0;

#generate image
      print ttext[tnst,1,0,0];
      printtab();
      tnst--;

#go past a nonblank line
      imline = ttext[tnst,currow[tnst],curcol[tnst],line[tnst]];
      stripsd();
      ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] = imline;
      if( length( imline ) )
        line[tnst]++;

#print or copy the rendered subtable
      inrow = 0;
      if( !tnst ) {
        print "<PRE>";
        while( inrow <= iline ) {
#underscores are only width placeholders for form fields and images
          gsub( "_" , "" , image[inrow] );
          print image[inrow++];
        }
        print "</PRE>";
        line[tnst] = 0;
        delete tcs;
        delete colwid;
        delete rsp;
#system("") is used here to flush the output written so far
        system("");
      }
      else {
#nested table: its image becomes lines of the enclosing cell
        while( inrow <= iline ) {
          ttext[tnst,currow[tnst],curcol[tnst],line[tnst]+inrow] = image[inrow];
          inrow++;
        }
        col = length( image[inrow-1] );
        colsp[tnst] = tcs[tnst,currow[tnst],curcol[tnst]];
        fixcolsp();
        line[tnst] += inrow;
        if( line[tnst] > rowhght[tnst,currow[tnst]] )
          rowhght[tnst,currow[tnst]] = line[tnst];
      }
      inrow = 0;
      currow[tnst+1] = 0;
      if( tnst && ( vertstack || !tdflag[tnst] ) ) {
        endentry();
        endrow();
      }
      tdflag[tnst] = 0;
    }
  }

#indicate options follow

#DEVEL
#mainly for forms - extract name in select or input
#    if( match(toupper($1), "NAME=") ) {
#       name = substr($1,RSTART+5,64);
#       if( match(name," ") )
#         name = substr(name,1,RSTART-1 );
#       gsub( "\>" , "", name );
#       name = stripit( name );
#       $2 = name "=" $2;
#    }


#fix character formats
  if( !length( $2 ) )
    $2 = " ";

  if( NF > 1 && length($2) ) {
    gsub("\n","",$2);
    gsub("\r","",$2);
    gsub("\t"," ",$2);
    gsub("  *"," ",$2);
    if( !tdflag[tnst] )
      sub(" $","",$2);
  }

#append tags
  if( tnst ) {
    imline = toupper($1) " ";
    if( \
        substr(imline,1,5) == "FORM " || \
        substr(imline,1,6) == "INPUT " || \
        substr(imline,1,7) == "OPTION " || \
        substr(imline,1,7) == "SELECT " || \
        substr(imline,1,6) == "/FORM " || \
        substr(imline,1,8) == "/SELECT " || \
        substr(imline,1,4) == "IMG " || \
        substr(imline,1,2) == "A " \
          ) {

      if( substr(imline,1,2) == "A " && match( toupper(imline) , "HREF" ) ) {
#keep the link on a line of its own. The whole string must sit on one
#line: a newline ends an awk statement, and mail wrapping had cut off
#the ">[LINK]</a>" part.
        imline = ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] "<" $1 ">[LINK]</a>" $2;
        ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] = imline;
        line[tnst]++;
        ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] = "";
        $1 = "";
        $2 = "";
      }
      else {
        imline = ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] "<" $1 ">";
        ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] = imline;
      }
    }

#line breaking entries
#tag keeps the upper-cased tag for the tests further down; imline is
#reused for cell text below, and that text must not be mistaken for a tag
    tag = toupper($1) " ";
    if( tag == "BR " || \
        substr(tag,1,3) == "HR " || \
        substr(tag,1,3) == "LI " || \
        substr(tag,1,2) == "P " \
        ) {

#       substr(imline,0,7) == "OPTION " || \
#       substr(imline,0,8) == "/SELECT " || \

#ignore for blank lines
      imline = ttext[tnst,currow[tnst],curcol[tnst],line[tnst]];
      stripsd();
      ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] = imline;
      if( length( imline ) ) {
        line[tnst]++;
        ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] = "";
      }
    }

#form input fields
    if( substr(tag,1,6) == "INPUT " ) {
      size = 0;
      if( match(toupper($1), "SIZE=") ) {
          size = substr($1,RSTART+5,64);
#int() makes the comparison below numeric; stripit() returns a string,
#and a number compared with a string is compared as text
          size = int( stripit( size ) );
      }
      value = "";
      if( match(toupper($1), "VALUE=") ) {
          value = substr($1,RSTART+6,256);
          value = stripit( value );
          while( sub(" $","",value));
          while( sub("^ ","",value));
      }
      if( match(toupper($1), "TYPE=") ) {
          type = substr(tag,RSTART+5,256);
          type = stripit( type );
          if( toupper(type) == "CHECKBOX" )
            value = "___";
          if( toupper(type) == "RADIO" )
            value = "___";
      }
      if( length( value ) < size )
        value = value substr( underl , 1 , size - length(value) );

#reserve the field width with underscores (one more than the value)
      value = substr( underl , 1 , length( value ) + 1 );

      if( !match(toupper(tag), "TYPE=.?HIDDEN") )
        ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] \
          = ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] value ;

    }

#extract ALT string
    if( substr(tag,1,4) == "IMG " ) {
      sub( "[Aa][Ll][Tt]", "ALT" , $1 );
      sub( "ALT =", "ALT=" , $1 );
      sub( "ALT= ", "ALT=" , $1 );
      if( match(toupper($1)," ALT=")) {
        name = substr($1,RSTART+5,length($1)-7);
        name = stripit( name );
#reserve room for the ALT text plus two columns
        $2 = substr( underl , 1 , length( name ) + 2 ) $2;
      }
      else
        $2 = "__________" $2;

      imline = ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] $2;
      ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] = imline;
      line[tnst]++;
      ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] = "";

    }
    else {
#append line
      imline = ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] $2;
      ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] = imline;
    }

#       if( !tdflag[tnst] )
#         pre[tnst,currow[tnst]] = pre[tnst,currow[tnst]] imline;

    if( length($2) ) {
#Split long lines - not fully tested
#Only lines without markup are split. The split point is the first
#space found after backing up splitat columns at a time from the
#maximum width.

      while( !match( imline , "<" ) && vislength( imline ) > maxwid*colsp[tnst] ) {
        temp = 1;
        stspl = maxwid*colsp[tnst] - splitat * temp;
        while( stspl > 0 ) {
          if( match( substr( imline, stspl , 1024)," ") ) {
            ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] = substr(imline, 1, stspl + RSTART - 1);
            ttext[tnst,currow[tnst],curcol[tnst],line[tnst] + 1] = substr(imline, stspl + RSTART - 1, 1024);
            stspl = -1;
          }
          else
            stspl = maxwid*colsp[tnst] - splitat * ++temp;
        }
        line[tnst]++;
        imline = ttext[tnst,currow[tnst],curcol[tnst],line[tnst]];
      }
    }

  }
  else {
#outside tables: pass the tag and its text straight through
#NOTE(review): after a top-level </TABLE> $1 has been emptied, so this
#prints "<>" before the following text - confirm whether that is wanted
    print "<" $1 ">" $2;
#    imline = ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] "<" $1 ">" $2;
#    ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] = imline;
  }


#debug trace, kept entirely inside comments (mail wrapping had turned
#its second half into a live statement):
#print tnst","currow[tnst]","curcol[tnst]","line[tnst] "<" $1 ">" $2 ":" \
#  ttext[tnst,currow[tnst],curcol[tnst],line[tnst]] ":";
}

#At end of input, print any text still collected outside tables
#(only when lines outside of tables are enabled).
END {
  if( extlines ) {
    doxline();
  }
}

;
; To UNSUBSCRIBE:  Send a mail message to address@hidden
;                  with "unsubscribe lynx-dev" (without the
;                  quotation marks) on a line by itself.
;

reply via email to

[Prev in Thread] Current Thread [Next in Thread]