freetype-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Git][freetype/freetype][master] Avoid n^2 scanning for binary data.


From: Werner Lemberg (@wl)
Subject: [Git][freetype/freetype][master] Avoid n^2 scanning for binary data.
Date: Fri, 08 Mar 2024 16:49:06 +0000

Werner Lemberg pushed to branch master at FreeType / FreeType

Commits:

  • 17545d4b
    by Ben Wagner at 2024-03-08T17:47:43+01:00
    Avoid n^2 scanning for binary data.
    
    When creating a CID parser the location of the 'StartData' or '/sfnts'
    tokens needs to be known.  However, the token parser requires that the
    entire document be in memory and flattening the entire stream into memory is
    to be avoided.
    
    To avoid forcing the entire stream into memory, previously this code would
    scan through the stream looking for 'StartData' or '/sfnts' as strings.
    However, these strings could have been in a comment or string token, so the
    stream would be read into memory up to that point and the parser run to
    check that these strings were actually tokens.  This forced a parser restart
    from the beginning each time; as a result, data with many 'StartData'
    non-tokens would take n^2 time to check.
    
    * src/cid/cidparse.c (cid_parser_new): Change algorithm to make the initial
    scan look for the last possible 'StartData' or '/sfnts' string in the
    stream.  The stream is read forward instead of backward as a typical normal
    CID font will have one 'StartData' toward the beginning of the data and it
    it much faster to read the data from beginning to end instead of end to
    beginning.  For memory-based fonts the limit is set to the end of the stream
    since the stream is already in memory.  Then the parser is run once to look
    for 'StartData' or '/sfnts' tokens.  If they are found the parser is re-set
    to reflect this new information.
    
    Reported as
    
      https://issues.chromium.org/issues/40201695
    

1 changed file:

Changes:

  • src/cid/cidparse.c
    ... ... @@ -90,10 +90,15 @@
    90 90
         if ( error )
    
    91 91
           goto Exit;
    
    92 92
     
    
    93
    -  Again:
    
    94
    -    /* now, read the rest of the file until we find */
    
    95
    -    /* `StartData' or `/sfnts'                      */
    
    93
    +    if ( !stream->read ) {
    
    94
    +      /* just parse memory-based streams */
    
    95
    +      offset = stream->size;
    
    96
    +    }
    
    97
    +    else
    
    96 98
         {
    
    99
    +      /* Find the last `StartData` or `/sfnts`.  The parser requires */
    
    100
    +      /* contiguous memory; attempt to pin as little as necessary.   */
    
    101
    +
    
    97 102
           /*
    
    98 103
            * The algorithm is as follows (omitting the case with less than 256
    
    99 104
            * bytes to fill for simplicity).
    
    ... ... @@ -119,7 +124,8 @@
    119 124
           FT_Byte*  p           = buffer;
    
    120 125
     
    
    121 126
     
    
    122
    -      for ( offset = FT_STREAM_POS(); ; offset += 256 )
    
    127
    +      offset = 0;
    
    128
    +      while ( 1 )
    
    123 129
           {
    
    124 130
             FT_ULong  stream_len;
    
    125 131
     
    
    ... ... @@ -127,7 +133,7 @@
    127 133
             stream_len = stream->size - FT_STREAM_POS();
    
    128 134
     
    
    129 135
             read_len = FT_MIN( read_len, stream_len );
    
    130
    -        if ( FT_STREAM_READ( p, read_len ) )
    
    136
    +        if ( read_len && FT_STREAM_READ( p, read_len ) )
    
    131 137
               goto Exit;
    
    132 138
     
    
    133 139
             /* ensure that we do not compare with data beyond the buffer */
    
    ... ... @@ -141,20 +147,23 @@
    141 147
                    ft_strncmp( (char*)p, STARTDATA, STARTDATA_LEN ) == 0 )
    
    142 148
               {
    
    143 149
                 /* save offset of binary data after `StartData' */
    
    144
    -            offset += (FT_ULong)( p - buffer ) + STARTDATA_LEN + 1;
    
    145
    -            goto Found;
    
    150
    +            offset = FT_STREAM_POS() - read_len - read_offset
    
    151
    +                     + (FT_ULong)( p - buffer ) + STARTDATA_LEN + 1;
    
    146 152
               }
    
    147 153
               else if ( p[1] == 's'                                   &&
    
    148 154
                         ft_strncmp( (char*)p, SFNTS, SFNTS_LEN ) == 0 )
    
    149 155
               {
    
    150
    -            offset += (FT_ULong)( p - buffer ) + SFNTS_LEN + 1;
    
    151
    -            goto Found;
    
    156
    +            offset = FT_STREAM_POS() - read_len - read_offset
    
    157
    +                     + (FT_ULong)( p - buffer ) + SFNTS_LEN + 1;
    
    152 158
               }
    
    153 159
             }
    
    154 160
     
    
    155
    -        if ( read_offset + read_len < STARTDATA_LEN )
    
    161
    +        if ( read_offset + read_len <= STARTDATA_LEN )
    
    156 162
             {
    
    157
    -          FT_TRACE2(( "cid_parser_new: no `StartData' keyword found\n" ));
    
    163
    +          if ( offset )
    
    164
    +            goto Found;
    
    165
    +
    
    166
    +          FT_TRACE2(( "cid_parser_new: no `StartData` keyword found\n" ));
    
    158 167
               error = FT_THROW( Invalid_File_Format );
    
    159 168
               goto Exit;
    
    160 169
             }
    
    ... ... @@ -171,9 +180,9 @@
    171 180
         }
    
    172 181
     
    
    173 182
       Found:
    
    174
    -    /* We have found the start of the binary data or the `/sfnts' token. */
    
    175
    -    /* Now rewind and extract the frame corresponding to this PostScript */
    
    176
    -    /* section.                                                          */
    
    183
    +    /* We have found an efficient range to look for the binary data or    */
    
    184
    +    /* `/sfnts' token.  Now rewind and extract the frame corresponding to */
    
    185
    +    /* this PostScript section.                                           */
    
    177 186
     
    
    178 187
         ps_len = offset - base_offset;
    
    179 188
         if ( FT_STREAM_SEEK( base_offset )                  ||
    
    ... ... @@ -187,8 +196,8 @@
    187 196
         parser->root.limit     = parser->root.cursor + ps_len;
    
    188 197
         parser->num_dict       = FT_UINT_MAX;
    
    189 198
     
    
    190
    -    /* Finally, we check whether `StartData' or `/sfnts' was real --  */
    
    191
    -    /* it could be in a comment or string.  We also get the arguments */
    
    199
    +    /* Find the first real `StartData' or `/sfnts' -- the last one    */
    
    200
    +    /* could be in a comment or string.  We also get the arguments    */
    
    192 201
         /* of `StartData' to find out whether the data is represented in  */
    
    193 202
         /* binary or hex format.                                          */
    
    194 203
     
    
    ... ... @@ -216,6 +225,7 @@
    216 225
           {
    
    217 226
             T1_TokenRec  type_token;
    
    218 227
             FT_Long      binary_length;
    
    228
    +        FT_ULong     found_offset;
    
    219 229
     
    
    220 230
     
    
    221 231
             parser->root.cursor = arg1;
    
    ... ... @@ -234,6 +244,24 @@
    234 244
                 parser->binary_length = (FT_ULong)binary_length;
    
    235 245
             }
    
    236 246
     
    
    247
    +        /* set the real values for the parser, if different */
    
    248
    +        found_offset = (FT_ULong)( cur - parser->postscript )
    
    249
    +                       + STARTDATA_LEN + 1;
    
    250
    +        if ( found_offset != offset )
    
    251
    +        {
    
    252
    +          FT_FRAME_RELEASE( parser->postscript );
    
    253
    +
    
    254
    +          ps_len = found_offset - base_offset;
    
    255
    +          if ( FT_STREAM_SEEK( base_offset )                  ||
    
    256
    +               FT_FRAME_EXTRACT( ps_len, parser->postscript ) )
    
    257
    +            goto Exit;
    
    258
    +
    
    259
    +          parser->data_offset    = found_offset;
    
    260
    +          parser->postscript_len = ps_len;
    
    261
    +          parser->root.base      = parser->postscript;
    
    262
    +          parser->root.cursor    = parser->postscript;
    
    263
    +          parser->root.limit     = parser->root.cursor + ps_len;
    
    264
    +        }
    
    237 265
             goto Exit;
    
    238 266
           }
    
    239 267
           else if ( cur[1] == 's'                                   &&
    
    ... ... @@ -251,11 +279,8 @@
    251 279
           cur  = parser->root.cursor;
    
    252 280
         }
    
    253 281
     
    
    254
    -    /* we haven't found the correct `StartData'; go back and continue */
    
    255
    -    /* searching                                                      */
    
    256
    -    FT_FRAME_RELEASE( parser->postscript );
    
    257
    -    if ( !FT_STREAM_SEEK( offset ) )
    
    258
    -      goto Again;
    
    282
    +    FT_TRACE2(( "cid_parser_new: no `StartData` token found\n" ));
    
    283
    +    error = FT_THROW( Invalid_File_Format );
    
    259 284
     
    
    260 285
       Exit:
    
    261 286
         return error;
    


  • reply via email to

    [Prev in Thread] Current Thread [Next in Thread]