The statistics code is known to be very slow, because it relies on
DocIterator to go through the buffer.

This commit introduces a new Statitistics class that encapsulates the
main code, along a virtual method Inset::updateStatistics() that
allows to fine-tune how counting is done inset by inset.

In order to check that it works, the old code is not removed yet, and
Tools>Statistics displays the result of the two methods on the
console. I used in particular the user guide to check that the results
are the same.

The display in the status bar has switched to the new code.

The commit does also some profiling with pmprof and outputs some
timings when exiting LyX. I see consistently that the new code is 3x
faster.

Of course, this duplicated code will be removed when finalizing the
commit.

Please test.

JMarc
From 55846d62d16efa52b406ec441856a1da8a9b6b20 Mon Sep 17 00:00:00 2001
From: Jean-Marc Lasgouttes <lasgout...@lyx.org>
Date: Sun, 21 Jul 2024 22:09:28 +0200
Subject: [PATCH] WIP: rewrite statistics code

The statistics code is known to be very slow, because it relies on
DocIterator to go through the buffer.

This commit introduces a new Statitistics class that encapsulates the
main code, along a virtual method Inset::updateStatistics() that
allows to fine-tune how counting is done inset by inset.

In order to check that it works, the old code is not removed yet, and
Tools>Statistics displays the result of the two methods on the
console. I used in particular the user guide to check that the results
are the same.

The display in the status bar has switched to the new code.

The commit does also some profiling with pmprof and outputs some
timings when exiting LyX. I see consistently that the new code is 3x
faster.

Of course, this duplicated code will be removed when finalizing the
commit.
---
 src/Buffer.cpp                |  12 +++-
 src/Buffer.h                  |   2 +
 src/BufferView.cpp            |  21 ++++++
 src/Makefile.am               |   2 +
 src/Statistics.cpp            | 120 ++++++++++++++++++++++++++++++++++
 src/Statistics.h              |  64 ++++++++++++++++++
 src/frontends/qt/GuiView.cpp  |  23 ++-----
 src/insets/Inset.cpp          |  14 +++-
 src/insets/Inset.h            |   3 +
 src/insets/InsetCitation.cpp  |   7 ++
 src/insets/InsetCitation.h    |   2 +
 src/insets/InsetHyperlink.cpp |   9 ++-
 src/insets/InsetHyperlink.h   |   2 +
 src/insets/InsetQuotes.cpp    |  24 +++++++
 src/insets/InsetQuotes.h      |   2 +
 src/insets/InsetTabular.cpp   |   8 +++
 src/insets/InsetTabular.h     |   3 +
 src/insets/InsetText.cpp      |   7 ++
 src/insets/InsetText.h        |   3 +
 19 files changed, 309 insertions(+), 19 deletions(-)
 create mode 100644 src/Statistics.cpp
 create mode 100644 src/Statistics.h

diff --git a/src/Buffer.cpp b/src/Buffer.cpp
index e95d0edcc8..2def274a79 100644
--- a/src/Buffer.cpp
+++ b/src/Buffer.cpp
@@ -56,7 +56,7 @@
 #include "PDFOptions.h"
 #include "Session.h"
 #include "SpellChecker.h"
-#include "xml.h"
+#include "Statistics.h"
 #include "texstream.h"
 #include "TexRow.h"
 #include "Text.h"
@@ -66,6 +66,7 @@
 #include "VCBackend.h"
 #include "version.h"
 #include "WordLangTuple.h"
+#include "xml.h"
 
 #include "insets/InsetBranch.h"
 #include "insets/InsetInclude.h"
@@ -357,6 +358,9 @@ public:
 	///
 	mutable bool need_update;
 
+	///
+	Statistics statistics_;
+
 private:
 	int word_count_;
 	int char_count_;
@@ -5544,6 +5548,12 @@ int Buffer::charCount(bool with_blanks) const
 }
 
 
+Statistics & Buffer::statistics()
+{
+	return d->statistics_;
+}
+
+
 bool Buffer::areChangesPresent() const
 {
 	return inset().isChanged();
diff --git a/src/Buffer.h b/src/Buffer.h
index d9e7e325d4..17a8fbbc87 100644
--- a/src/Buffer.h
+++ b/src/Buffer.h
@@ -51,6 +51,7 @@ class otexstream;
 class ParagraphList;
 class ParIterator;
 class ParConstIterator;
+class Statistics;
 class TeXErrors;
 class TexRow;
 class TocBackend;
@@ -793,6 +794,7 @@ public:
 	/// statistics accessor functions
 	int wordCount() const;
 	int charCount(bool with_blanks) const;
+	Statistics & statistics();
 
 	///
 	bool areChangesPresent() const;
diff --git a/src/BufferView.cpp b/src/BufferView.cpp
index ce315a6f49..cfce928da2 100644
--- a/src/BufferView.cpp
+++ b/src/BufferView.cpp
@@ -38,6 +38,7 @@
 #include "MetricsInfo.h"
 #include "Paragraph.h"
 #include "Session.h"
+#include "Statistics.h"
 #include "texstream.h"
 #include "Text.h"
 #include "TextMetrics.h"
@@ -78,6 +79,7 @@
 #include "support/Lexer.h"
 #include "support/lstrings.h"
 #include "support/lyxlib.h"
+#include "support/pmprof.h"
 #include "support/types.h"
 
 #include <algorithm>
@@ -2014,6 +2016,7 @@ void BufferView::dispatch(FuncRequest const & cmd, DispatchResult & dr)
 			from = doc_iterator_begin(&buffer_);
 			to = doc_iterator_end(&buffer_);
 		}
+#if 0
 		buffer_.updateStatistics(from, to);
 		int const words = buffer_.wordCount();
 		int const chars = buffer_.charCount(false);
@@ -2040,6 +2043,24 @@ void BufferView::dispatch(FuncRequest const & cmd, DispatchResult & dr)
 			message += _("One character (no blanks)");
 
 		Alert::information(_("Statistics"), message);
+#else
+		{
+			PROFILE_THIS_BLOCK(old_updateStatistics);
+			buffer_.updateStatistics(from, to);
+			LYXERR0("OLD: " << "word=" << buffer_.wordCount()
+			                << ", char=" << buffer_.charCount(false)
+			                << ", char+sp=" << buffer_.charCount(true));
+		}
+#endif
+		{
+			PROFILE_THIS_BLOCK(new_updateStatistics);
+			Statistics & stats = buffer_.statistics();
+			stats.update(cur);
+			LYXERR0("NEW: " << "word=" << stats.word_count
+			                << ", char=" << stats.char_count
+			                << ", char+sp=" << stats.char_count + stats.blank_count);
+		}
+
 	}
 		break;
 
diff --git a/src/Makefile.am b/src/Makefile.am
index d6c9a0cb18..fa1161b71a 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -173,6 +173,7 @@ SOURCEFILESCORE = \
 	ServerSocket.cpp \
 	Session.cpp \
 	Spacing.cpp \
+	Statistics.cpp \
 	TexRow.cpp \
 	texstream.cpp \
 	Text.cpp \
@@ -277,6 +278,7 @@ HEADERFILESCORE = \
 	Session.h \
 	Spacing.h \
 	SpellChecker.h \
+	Statistics.h \
 	TexRow.h \
 	texstream.h \
 	Text.h \
diff --git a/src/Statistics.cpp b/src/Statistics.cpp
new file mode 100644
index 0000000000..eabae8f399
--- /dev/null
+++ b/src/Statistics.cpp
@@ -0,0 +1,120 @@
+// -*- C++ -*-
+/**
+ * \file Statistics.cpp
+ * This file is part of LyX, the document processor.
+ * Licence details can be found in the file COPYING.
+ *
+ * \author Jean-Marc Lasgouttes
+ *
+ * Full author contact details are available in file CREDITS.
+ */
+
+#include <config.h>
+
+#include "Statistics.h"
+
+#include "Paragraph.h"
+#include "Text.h"
+#include "Cursor.h"
+
+#include "support/lassert.h"
+#include "support/lstrings.h"
+#include "support/textutils.h"
+
+
+namespace lyx {
+
+using namespace support;
+
+
+void Statistics::update(CursorData const & cur)
+{
+	// reset counts
+	*this = Statistics();
+	if (cur.selection()) {
+		if (cur.inMathed())
+			return;
+		CursorSlice from, to;
+		from = cur.selBegin();
+		to = cur.selEnd();
+		update(from, to);
+	} else
+		update(*cur.bottom().text());
+}
+
+
+void Statistics::update(docstring const & s)
+{
+	// FIXME: use a stripped-down version of the paragraph code.
+	// This is the original code from InsetCitation::isWords()
+	char_count += s.size();
+	// FIXME: this does not count words properly
+	word_count += wordCount(s);
+	// FIXME: spaces are not counted
+}
+
+
+void Statistics::update(Text const & text)
+{
+	for (Paragraph const & par : text.paragraphs())
+		update(par);
+}
+
+
+void Statistics::update(CursorSlice const & from, CursorSlice & to)
+{
+	LASSERT(from.text() == to.text(), return);
+	if (from.idx() == to.idx()) {
+		if (from.pit() == to.pit()) {
+			update(from.paragraph(), from.pos(), to.pos());
+		} else {
+			pos_type frompos = from.pos();
+			for (pit_type pit = from.pit() ; pit < to.pit() ; ++pit) {
+				update(from.text()->getPar(pit), frompos);
+				frompos = 0;
+			}
+			update(to.paragraph(), 0, to.pos());
+		}
+	} else
+		for (idx_type idx = from.idx() ; idx <= to.idx(); ++idx)
+			update(*from.inset().getText(idx));
+}
+
+
+void Statistics::update(Paragraph const & par, pos_type from, pos_type to)
+{
+	if (to == -1)
+		to = par.size();
+
+	for (pos_type pos = from ; pos < to ; ++pos) {
+		Inset const * ins = par.isInset(pos) ? par.getInset(pos) : nullptr;
+		// Stuff that we skip
+		if (par.isDeleted(pos))
+			continue;
+		if (ins && skip_no_output && !ins->producesOutput())
+			continue;
+
+		// words
+		if (par.isWordSeparator(pos))
+			inword = false;
+		else if (!inword) {
+			++word_count;
+			inword = true;
+		}
+
+		if (ins)
+			ins->updateStatistics(*this);
+		else {
+			char_type const c = par.getChar(pos);
+			if (isPrintableNonspace(c))
+				++char_count;
+			else if (lyx::isSpace(c))
+				++blank_count;
+		}
+	}
+	inword = false;
+}
+
+
+} // namespace lyx
+
diff --git a/src/Statistics.h b/src/Statistics.h
new file mode 100644
index 0000000000..7439de06cf
--- /dev/null
+++ b/src/Statistics.h
@@ -0,0 +1,64 @@
+// -*- C++ -*-
+/**
+ * \file Statistics.h
+ * This file is part of LyX, the document processor.
+ * Licence details can be found in the file COPYING.
+ *
+ * \author Jean-Marc Lasgouttes
+ *
+ * Full author contact details are available in file CREDITS.
+ */
+
+#ifndef STATISTICS_H
+#define STATISTICS_H
+
+#include "support/strfwd.h"
+#include "support/types.h"
+
+namespace lyx {
+
+class CursorData;
+class CursorSlice;
+class Text;
+class Paragraph;
+
+// Class used to compute letters/words statistics on buffer or selection
+class Statistics {
+public:
+	// Number of words
+	int word_count = 0;
+	// Number of non blank characters
+	int char_count = 0;
+	// Number of blank characters
+	int blank_count = 0;
+	// Indicate whether parts that are not output should be counted.
+	bool skip_no_output = true;
+
+	/// Count characters in the whole document, or in the selection if
+	/// there is one. This is the main entry point.
+	void update(CursorData const & cur);
+	///  Count chars and words in this string
+	void update(docstring const & s);
+	/// Count chars and words in the paragraphs of \c text
+	void update(Text const & text);
+
+private:
+
+	/// Count chars and words between two positions
+	void update(CursorSlice const & from, CursorSlice & to);
+
+	/** Count chars and words in a paragraph
+	 * \param par: the paragraph
+	 * \param from: starting position
+	 * \param to: end position. If it is equal to -1, then the end is
+	 *    the end of the paragraph.
+	 */
+	void update(Paragraph const & par, pos_type from = 0, pos_type to = -1);
+
+	// Used in the code to track status
+	bool inword = false;
+};
+
+}
+
+#endif // STATISTICS_H
diff --git a/src/frontends/qt/GuiView.cpp b/src/frontends/qt/GuiView.cpp
index 3b24846852..a6b48b3053 100644
--- a/src/frontends/qt/GuiView.cpp
+++ b/src/frontends/qt/GuiView.cpp
@@ -60,8 +60,9 @@
 #include "LyXRC.h"
 #include "LyXVC.h"
 #include "Paragraph.h"
-#include "SpellChecker.h"
 #include "Session.h"
+#include "SpellChecker.h"
+#include "Statistics.h"
 #include "TexRow.h"
 #include "Text.h"
 #include "Toolbars.h"
@@ -1478,31 +1479,21 @@ void GuiView::showStats()
 	if (d.time_to_update > 0)
 		return;
 
-	DocIterator from, to;
-	if (cur.selection()) {
-		from = cur.selectionBegin();
-		to = cur.selectionEnd();
-		d.already_in_selection_ = true;
-	} else {
-		from = doc_iterator_begin(buf);
-		to = doc_iterator_end(buf);
-		d.already_in_selection_ = false;
-	}
-
 	// Don't attempt to calculate stats if
 	// the buffer is busy as this might crash (#12935)
+	Statistics & statistics = buf->statistics();
 	if (!busy() && !bv->busy())
-		buf->updateStatistics(from, to);
+		statistics.update(cur);
 
 	QStringList stats;
 	if (word_count_enabled_) {
-		int const words = buf->wordCount() - bv->stats_ref_value_w();
+		int const words = statistics.word_count - bv->stats_ref_value_w();
 		if (words == 1)
 			stats << toqstr(bformat(_("%1$d Word"), words));
 		else
 			stats << toqstr(bformat(_("%1$d Words"), words));
 	}
-	int const chars_with_blanks = buf->charCount(true);
+	int const chars_with_blanks = statistics.char_count + statistics.blank_count;
 	if (char_count_enabled_) {
 		int const chars_with_blanks_disp = chars_with_blanks - bv->stats_ref_value_c();
 		if (chars_with_blanks == 1)
@@ -1511,7 +1502,7 @@ void GuiView::showStats()
 			stats << toqstr(bformat(_("%1$d Characters"), chars_with_blanks_disp));
 	}
 	if (char_nb_count_enabled_) {
-		int const chars = buf->charCount(false) - bv->stats_ref_value_nb();
+		int const chars = statistics.char_count - bv->stats_ref_value_nb();
 		if (chars == 1)
 			stats << toqstr(bformat(_("%1$d Character (no Blanks)"), chars));
 		else
diff --git a/src/insets/Inset.cpp b/src/insets/Inset.cpp
index b949f9c989..c3efc2b5f2 100644
--- a/src/insets/Inset.cpp
+++ b/src/insets/Inset.cpp
@@ -30,10 +30,11 @@
 #include "InsetLayout.h"
 #include "MetricsInfo.h"
 #include "output_xhtml.h"
-#include "xml.h"
+#include "Statistics.h"
 #include "Text.h"
 #include "TextClass.h"
 #include "TocBackend.h"
+#include "xml.h"
 
 #include "frontends/Application.h"
 #include "frontends/Painter.h"
@@ -626,6 +627,17 @@ bool Inset::undefined() const
 }
 
 
+void Inset::updateStatistics(Statistics & stats) const
+{
+	if (isLetter()) {
+		odocstringstream os;
+		toString(os);
+		stats.char_count += os.str().length();
+	} else if (isSpace())
+		++stats.blank_count;
+}
+
+
 CtObject Inset::getCtObject(OutputParams const &) const
 {
 	return CtObject::Normal;
diff --git a/src/insets/Inset.h b/src/insets/Inset.h
index ea8f00e38c..4140f5319c 100644
--- a/src/insets/Inset.h
+++ b/src/insets/Inset.h
@@ -60,6 +60,7 @@ class MathAtom;
 class MetricsInfo;
 class PainterInfo;
 class ParIterator;
+class Statistics;
 class Text;
 class TocBackend;
 class XMLStream;
@@ -483,6 +484,8 @@ public:
 	/// returns chars, words if the inset is equivalent to such, otherwise
 	/// (0,0), which should be interpreted as 'false'
 	virtual std::pair<int, int> isWords() const { return std::pair<int,int>(0, 0); }
+	/// Count words, characters and spaces in inset
+	virtual void updateStatistics(Statistics & stats) const;
 	/// does this inset try to use all available space (like \\hfill does)?
 	virtual bool isHfill() const { return false; }
 
diff --git a/src/insets/InsetCitation.cpp b/src/insets/InsetCitation.cpp
index 87130462e0..7f51092754 100644
--- a/src/insets/InsetCitation.cpp
+++ b/src/insets/InsetCitation.cpp
@@ -29,6 +29,7 @@
 #include "output_xhtml.h"
 #include "output_docbook.h"
 #include "ParIterator.h"
+#include "Statistics.h"
 #include "texstream.h"
 #include "TocBackend.h"
 
@@ -798,6 +799,12 @@ pair<int, int> InsetCitation::isWords() const
 }
 
 
+void InsetCitation::updateStatistics(Statistics & stats) const
+{
+	stats.update(generateLabel(false));
+}
+
+
 string InsetCitation::contextMenuName() const
 {
 	return "context-citation";
diff --git a/src/insets/InsetCitation.h b/src/insets/InsetCitation.h
index b1b27e0291..b8e1f8ed17 100644
--- a/src/insets/InsetCitation.h
+++ b/src/insets/InsetCitation.h
@@ -105,6 +105,8 @@ public:
 	void openCitation();
 	///
 	std::pair<int, int> isWords() const override;
+	///
+	void updateStatistics(Statistics & stats) const override;
 
 private:
 	/// tries to make a pretty label and makes a basic one if not
diff --git a/src/insets/InsetHyperlink.cpp b/src/insets/InsetHyperlink.cpp
index dcdcbbb8e1..7ea02fc1d4 100644
--- a/src/insets/InsetHyperlink.cpp
+++ b/src/insets/InsetHyperlink.cpp
@@ -20,8 +20,9 @@
 #include "LyX.h"
 #include "output_docbook.h"
 #include "output_xhtml.h"
-#include "xml.h"
+#include "Statistics.h"
 #include "texstream.h"
+#include "xml.h"
 
 #include "support/debug.h"
 #include "support/docstream.h"
@@ -306,6 +307,12 @@ pair<int, int> InsetHyperlink::isWords() const
 }
 
 
+void InsetHyperlink::updateStatistics(Statistics & stats) const
+{
+	stats.update(getParam("name"));
+}
+
+
 string InsetHyperlink::contextMenuName() const
 {
 	return "context-hyperlink";
diff --git a/src/insets/InsetHyperlink.h b/src/insets/InsetHyperlink.h
index 36bd249902..c7f1063a62 100644
--- a/src/insets/InsetHyperlink.h
+++ b/src/insets/InsetHyperlink.h
@@ -56,6 +56,8 @@ public:
 	docstring xhtml(XMLStream &, OutputParams const &) const override;
 	///
 	std::pair<int, int> isWords() const override;
+	///
+	void updateStatistics(Statistics & stats) const override;
 	//@}
 
 	/// \name Static public methods obligated for InsetCommand derived classes
diff --git a/src/insets/InsetQuotes.cpp b/src/insets/InsetQuotes.cpp
index 8d6caf1e5e..53bc297164 100644
--- a/src/insets/InsetQuotes.cpp
+++ b/src/insets/InsetQuotes.cpp
@@ -28,6 +28,7 @@
 #include "MetricsInfo.h"
 #include "Paragraph.h"
 #include "ParIterator.h"
+#include "Statistics.h"
 #include "texstream.h"
 #include "xml.h"
 
@@ -1055,4 +1056,27 @@ pair<int, int> InsetQuotes::isWords() const
 	return std::pair<int,int>(length, 0);
 }
 
+void InsetQuotes::updateStatistics(Statistics & stats) const
+{
+	int length = 1;
+	// In PassThru, we use straight quotes otherwise we need to check for French
+	if (!pass_thru_) {
+
+		QuoteStyle style = (style_ == QuoteStyle::Dynamic) ? global_style_ : style_;
+
+		// in French, thin spaces are added inside double guillemets
+		if (level_ == QuoteLevel::Primary
+		    && (style == QuoteStyle::Swiss
+			|| style == QuoteStyle::French
+			|| style == QuoteStyle::FrenchIN)
+		    && prefixIs(context_lang_, "fr")) {
+			// space added by default for all formats
+			length++;
+		}
+	}
+
+	//one or two characters from the statistics perspective
+	stats.char_count += length;
+}
+
 } // namespace lyx
diff --git a/src/insets/InsetQuotes.h b/src/insets/InsetQuotes.h
index 13bd8ea124..015fbd4464 100644
--- a/src/insets/InsetQuotes.h
+++ b/src/insets/InsetQuotes.h
@@ -179,6 +179,8 @@ public:
 	std::string getType() const;
 	///
 	std::pair<int, int> isWords() const override;
+	///
+	void updateStatistics(Statistics & stats) const override;
 
 private:
 	///
diff --git a/src/insets/InsetTabular.cpp b/src/insets/InsetTabular.cpp
index 74a0060aaf..6f1b470ea7 100644
--- a/src/insets/InsetTabular.cpp
+++ b/src/insets/InsetTabular.cpp
@@ -45,6 +45,7 @@
 #include "output_xhtml.h"
 #include "Paragraph.h"
 #include "ParIterator.h"
+#include "Statistics.h"
 #include "TexRow.h"
 #include "texstream.h"
 #include "TextClass.h"
@@ -5017,6 +5018,13 @@ void InsetTabular::updateBuffer(ParIterator const & it, UpdateType utype, bool c
 }
 
 
+void InsetTabular::updateStatistics(Statistics & stats) const
+{
+	for (idx_type idx = 0 ; idx < nargs(); ++idx)
+		stats.update(*getText(idx));
+}
+
+
 void InsetTabular::addToToc(DocIterator const & cpit, bool output_active,
 							UpdateType utype, TocBackend & backend) const
 {
diff --git a/src/insets/InsetTabular.h b/src/insets/InsetTabular.h
index f6939c6eff..8b052c5d73 100644
--- a/src/insets/InsetTabular.h
+++ b/src/insets/InsetTabular.h
@@ -1113,9 +1113,12 @@ public:
 	Inset * editXY(Cursor & cur, int x, int y) override;
 	/// can we go further down on mouse click?
 	bool descendable(BufferView const &) const override { return true; }
+
 	/// Update the counters of this inset and of its contents
 	void updateBuffer(ParIterator const &, UpdateType, bool const deleted = false) override;
 	///
+	void updateStatistics(Statistics & stats) const;
+	///
 	void addToToc(DocIterator const & di, bool output_active,
 				  UpdateType utype, TocBackend & backend) const override;
 
diff --git a/src/insets/InsetText.cpp b/src/insets/InsetText.cpp
index ac3bc8e102..8a3109947f 100644
--- a/src/insets/InsetText.cpp
+++ b/src/insets/InsetText.cpp
@@ -50,6 +50,7 @@
 #include "Paragraph.h"
 #include "ParagraphParameters.h"
 #include "ParIterator.h"
+#include "Statistics.h"
 #include "TexRow.h"
 #include "texstream.h"
 #include "TextClass.h"
@@ -1141,6 +1142,12 @@ void InsetText::updateBuffer(ParIterator const & it, UpdateType utype, bool cons
 }
 
 
+void InsetText::updateStatistics(Statistics & stats) const
+{
+		stats.update(text());
+}
+
+
 void InsetText::toString(odocstream & os) const
 {
 	os << text().asString(0, 1, AS_STR_LABEL | AS_STR_INSETS);
diff --git a/src/insets/InsetText.h b/src/insets/InsetText.h
index 423735a032..f46bf1bbfe 100644
--- a/src/insets/InsetText.h
+++ b/src/insets/InsetText.h
@@ -177,6 +177,9 @@ public:
 
 	/// Update the counters of this inset and of its contents
 	void updateBuffer(ParIterator const &, UpdateType, bool const deleted = false) override;
+	///
+	void updateStatistics(Statistics & stats) const override;
+
 	///
 	void setMacrocontextPositionRecursive(DocIterator const & pos);
 	///
-- 
2.43.0

-- 
lyx-devel mailing list
lyx-devel@lists.lyx.org
http://lists.lyx.org/mailman/listinfo/lyx-devel

Reply via email to