These are the token frequencies I get when loading Userguide.lyx.

3341 \family
1891 \layout
1166 \bar
874 \emph
799 \begin_inset
575 \shape
547 \noun
505 \series
502 \color
492 \SpecialChar
330 \size
107 \end_deeper
107 \begin_deeper
80 \labelwidthstring
68 \backslash
54 \i
39 \hfill
25 \align
21 \newline
16 \added_space_top
15 \added_space_bottom
5 \bibitem
4 \noindent
1 \use_numerical_citations
1 \use_natbib
1 \use_geometry
1 \use_amsmath
1 \tocdepth
1 \the_end
1 \textclass
1 \spacing
1 \secnumdepth
1 \quotes_times
1 \quotes_language
1 \paragraph_separation
1 \papersize
1 \papersides
1 \paperpagestyle
1 \paperpackage
1 \paperorientation
1 \paperfontsize
1 \papercolumns
1 \line_bottom
1 \language
1 \inputencoding
1 \graphics
1 \fontscheme
1 \defskip
1 \begin_preamble


To create this I add:

        {
                // Debug instrumentation: append the current token to the log
                // file, one token per line. The stream is reopened in append
                // mode on every pass and closed when this scope ends.
                // NOTE(review): `token` is expected to be in scope at the
                // insertion point -- confirm against the host function.
                ofstream token_log("/tmp/tokens.txt", std::ios_base::app);
                token_log << token << "\n";
        }


to the top of parseSingleLyXformat2Token in buffer.C.

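/tmp/tokens.txt is first filtered into /tmp/realtokens.txt (see the egrep command further down), and that file is then counted by this prog: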

bucketcheck.C:

#include <fstream>
#include <iostream>
#include <map>
#include <string>

using namespace std;


// Counts how often each distinct line occurs in /tmp/realtokens.txt and
// prints one "<count> <line>" entry per distinct line to stdout, in the
// map's key order (sorting by count is left to the caller).
//
// Returns 0 on success, 1 if the input file cannot be opened.
int main()
{
        std::ifstream ifs("/tmp/realtokens.txt");
        if (!ifs) {
                // Without this check a missing input file would silently
                // produce empty output and a successful exit status.
                std::cerr << "bucketcheck: cannot open /tmp/realtokens.txt\n";
                return 1;
        }

        std::map<std::string, int> buckets;

        std::string line;
        while (std::getline(ifs, line)) {
                // operator[] inserts a zero count the first time a line
                // is seen, so the increment works for new and old keys.
                ++buckets[line];
        }

        std::map<std::string, int>::const_iterator cit = buckets.begin();
        std::map<std::string, int>::const_iterator const end = buckets.end();
        for (; cit != end; ++cit) {
                std::cout << cit->second << ' ' << cit->first << '\n';
        }

        return 0;
}

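which is run as the second of these two commands (the first one filters out the real tokens, i.e. the lines starting with a backslash):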

# Keep only the lines that begin with a literal backslash, i.e. the real tokens.
egrep "^\\\\" /tmp/tokens.txt > /tmp/realtokens.txt
# Count the tokens, then order the "<count> <token>" lines by descending count.
./bucketcheck | sort -g | tac > /tmp/tokenorder.txt


Results for other largish lyx files would be nice to have.

-- 
        Lgb


Reply via email to