tbo...@web.de writes:

> -static void gather_stats(const char *buf, unsigned long size, struct 
> text_stat *stats)
> +static void gather_stats_partly(const char *buf, unsigned long len,
> +                             struct text_stat *stats, unsigned earlyout)
>  {

I think it is OK not to rename the function (you'd be passing earlyout=0
for callers that want exact stat, right?).

>       unsigned long i;
>  
> -     memset(stats, 0, sizeof(*stats));
> -
> -     for (i = 0; i < size; i++) {
> +     if (!buf || !len)
> +             return;
> +     for (i = 0; i < len; i++) {
>               unsigned char c = buf[i];
>               if (c == '\r') {
> -                     if (i+1 < size && buf[i+1] == '\n') {
> +                     stats->stat_bits |= CONVERT_STAT_BITS_ANY_CR;
> +                     if (i+1 < len && buf[i+1] == '\n') {
>                               stats->crlf++;
>                               i++;
> -                     } else
> +                             stats->stat_bits |= CONVERT_STAT_BITS_TXT_CRLF;
> +                     } else {
>                               stats->lonecr++;
> +                             stats->stat_bits |= CONVERT_STAT_BITS_BIN;
> +                     }
>                       continue;
>               }
>               if (c == '\n') {
>                       stats->lonelf++;
> +                     stats->stat_bits |= CONVERT_STAT_BITS_TXT_LF;
>                       continue;
>               }
>               if (c == 127)
> @@ -67,7 +74,7 @@ static void gather_stats(const char *buf, unsigned long 
> size, struct text_stat *
>                               stats->printable++;
>                               break;
>                       case 0:
> -                             stats->nul++;
> +                             stats->stat_bits |= CONVERT_STAT_BITS_BIN;
>                               /* fall through */
>                       default:
>                               stats->nonprintable++;


So depending on the distribution of the bytes in the file, the
bitfields in stats->stat_bits will be filled one bit at a time in
random order.

> @@ -75,10 +82,12 @@ static void gather_stats(const char *buf, unsigned long 
> size, struct text_stat *
>               }
>               else
>                       stats->printable++;
> +             if (stats->stat_bits & earlyout)
> +                     break; /* We found what we have been searching for */

But an "earlyout" says that if "any" of the earlyout bit is seen, we
can return.

It somehow felt a bit too limited to me in my initial reading, but I
guess I shouldn't be surprised to see that such a limited interface
is sufficient for a file-local helper function ;-).  

The only caller that the semantics of this exit condition matters is
the one that wants to know "do we have NUL or CR anywhere?", so I
guess this should be sufficient.

>       }
>  
>       /* If file ends with EOF then don't count this EOF as non-printable. */
> -     if (size >= 1 && buf[size-1] == '\032')
> +     if (len >= 1 && buf[len-1] == '\032')
>               stats->nonprintable--;

This noise is somewhat irritating.  Was there a reason why size was
a bad name for the variable?

> +static const char *convert_stats_ascii(unsigned convert_stats)
>  {
> -     unsigned int convert_stats = gather_convert_stats(data, size);
> -
> +     const unsigned mask = CONVERT_STAT_BITS_TXT_LF |
> +             CONVERT_STAT_BITS_TXT_CRLF;
>       if (convert_stats & CONVERT_STAT_BITS_BIN)
>               return "-text";
> -     switch (convert_stats) {
> +     switch (convert_stats & mask) {
>       case CONVERT_STAT_BITS_TXT_LF:
>               return "lf";
>       case CONVERT_STAT_BITS_TXT_CRLF:

Subtle.  The caller runs the stat colllection with early-out set to
BITS_BIN, so that this can set "-text" early.  It knows that without
BITS_BIN, the stat was taken for the whole contents and the check lf
or crlf can be reliable.

I wonder if we can/need to do something to remove this subtleness
out of this callchain, which could be a source of confusion.

> @@ -132,24 +162,45 @@ static const char *gather_convert_stats_ascii(const 
> char *data, unsigned long si
>       }
>  }
>  
> +static unsigned get_convert_stats_wt(const char *path)
> +{
> +     struct text_stat stats;
> +     unsigned earlyout = CONVERT_STAT_BITS_BIN;
> +     int fd;
> +     memset(&stats, 0, sizeof(stats));
> +     fd = open(path, O_RDONLY);
> +     if (fd < 0)
> +             return 0;
> +     for (;;) {
> +             char buf[2*1024];

Where is this 2kB come from?  Out of thin air?

Reply via email to