From 72e4f0ce657ca2bc8f597cfa4d7dd86d619e08ce Mon Sep 17 00:00:00 2001
From: Shuo Shen <shuoshen178@gmail.com>
Date: Fri, 9 May 2025 11:06:35 -0700
Subject: [PATCH] lisp/org.el: refactor the org-tag family regex to prevent
 duplicates

* org.el: This is a refactoring patch, and there are no functional
changes.  The goal of the patch is to remove duplication in the tag
regexps so that in the future it will be easier to allow new character
sets for tags.  The commit adds `org-tag-valid-char-set`,
`org-tag-invalid-char-re`, `org-tag-group-enclosed-re`, and
`org-tag-group-optional-re` to be used instead of inline regexps.  It
refactors `org-tag-re`, `org-tag-group-re`, and `org-tag-line-re` to
build upon the other, smaller regexps.

* ol-bibtex.el, org-element.el: Replace the inline regexps with the
predefined constants.
---
 lisp/ol-bibtex.el   |  3 ++-
 lisp/org-element.el |  2 +-
 lisp/org.el         | 42 ++++++++++++++++++++++++++++++++----------
 3 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/lisp/ol-bibtex.el b/lisp/ol-bibtex.el
index 37b1cd394..c6cf546bc 100644
--- a/lisp/ol-bibtex.el
+++ b/lisp/ol-bibtex.el
@@ -107,6 +107,7 @@
 
 ;;; Code:
 
+(require 'org)
 (require 'org-macs)
 (org-assert-version)
 
@@ -754,7 +755,7 @@ entry at point."
 			 (funcall
 			  togtag
 			  (replace-regexp-in-string
-			   "[^[:alnum:]_@#%]" ""
+			   org-tag-invalid-char-re ""
 			   (replace-regexp-in-string "[ \t]+" "_" kw))))
 		     (org-bibtex-put (car pair) (cdr pair) insert-raw)))
 	(_ (org-bibtex-put (car pair) (cdr pair) insert-raw))))
diff --git a/lisp/org-element.el b/lisp/org-element.el
index 56c03a0aa..ea31c869f 100644
--- a/lisp/org-element.el
+++ b/lisp/org-element.el
@@ -1348,7 +1348,7 @@ Throw `:org-element-deferred-retry' signal at the end."
                      (skip-chars-forward " \t"))))
 	     (title-start (point))
 	     (tags (when (re-search-forward
-			  "\\(:[[:alnum:]_@#%:]+:\\)[ \t]*$"
+                          (concat org-tag-group-enclosed-re "[ \t]*$")
 			  (line-end-position)
 			  'move)
 		     (goto-char (match-beginning 0))
diff --git a/lisp/org.el b/lisp/org.el
index dfcbe84ff..b17af7217 100644
--- a/lisp/org.el
+++ b/lisp/org.el
@@ -665,16 +665,41 @@ but the stars and the body are.")
 An archived subtree does not open during visibility cycling, and does
 not contribute to the agenda listings.")
 
-(defconst org-tag-re "[[:alnum:]_@#%]+"
+(defconst org-tag-valid-char-set "[:alnum:]_@#%"
+  "Characters and character classes that are valid within a tag.
+The value is the inside of a regexp bracket expression, without the
+enclosing brackets.  The tag regexps are built from it.")
+
+(defconst org-tag-invalid-char-re
+  (format "[^%s]" org-tag-valid-char-set)
+  "Regexp matching a single character that is not valid in a tag.")
+
+(defconst org-tag-re (format "[%s]+" org-tag-valid-char-set)
   "Regexp matching a single tag.")
 
-(defconst org-tag-group-re "[ \t]+\\(:\\([[:alnum:]_@#%:]+\\):\\)[ \t]*$"
+(defconst org-tag-group-enclosed-re
+  (format "\\(:\\([%s:]+\\):\\)" org-tag-valid-char-set)
+  "Regexp matching a colon-enclosed group of tags, e.g. \":TAG1:TAG2:\".
+Surrounding spaces and tabs are not part of the match.  Match
+group 1 stores the tags with the enclosing colons, and match group
+2 stores the tags without them.  Built from
+`org-tag-valid-char-set' with the addition of the colon.")
+
+(defconst org-tag-group-optional-re
+  (concat "\\(?:[ \t]+" org-tag-group-enclosed-re "\\)?[ \t]*$")
+  "Regexp matching an optional tag group at the end of a line.
+A tag group must be preceded by at least one space or tab and may
+be followed by trailing blanks.  When a tag group is present, group
+1 is the tags with the enclosing colons and group 2 without them.")
+
+(defconst org-tag-group-re
+  (format "[ \t]+%s[ \t]*$" org-tag-group-enclosed-re)
   "Regexp matching the tag group at the end of a line, with leading spaces.
 Tags are stored in match group 1.  Match group 2 stores the tags
 without the enclosing colons.")
 
 (defconst org-tag-line-re
-  "^\\*+ \\(?:.*[ \t]\\)?\\(:\\([[:alnum:]_@#%:]+\\):\\)[ \t]*$"
+  (format "^\\*+ \\(?:.*[ \t]\\)?%s[ \t]*$" org-tag-group-enclosed-re)
   "Regexp matching tags in a headline.
 Tags are stored in match group 1.  Match group 2 stores the tags
 without the enclosing colons.")
@@ -4522,8 +4547,7 @@ related expressions."
 		      "\\(?: +" org-todo-regexp "\\)?"
 		      "\\(?: +\\(\\[#.\\]\\)\\)?"
 		      "\\(?: +\\(.*?\\)\\)??"
-		      "\\(?:[ \t]+\\(:[[:alnum:]_@#%:]+:\\)\\)?"
-		      "[ \t]*$")
+                      org-tag-group-optional-re)
 	      org-complex-heading-regexp-format
 	      (concat "^\\(\\*+\\)"
 		      "\\(?: +" org-todo-regexp "\\)?"
@@ -4536,14 +4560,12 @@ related expressions."
 		      "\\(%s\\)"
 		      "\\(?: *\\[[0-9%%/]+\\]\\)*"
 		      "\\)"
-		      "\\(?:[ \t]+\\(:[[:alnum:]_@#%%:]+:\\)\\)?"
-		      "[ \t]*$")
+		      (replace-regexp-in-string "%" "%%" org-tag-group-optional-re t t)) ; template for `format': double literal %
 	      org-todo-line-tags-regexp
 	      (concat "^\\(\\*+\\)"
 		      "\\(?: +" org-todo-regexp "\\)?"
 		      "\\(?: +\\(.*?\\)\\)??"
-		      "\\(?:[ \t]+\\(:[[:alnum:]:_@#%]+:\\)\\)?"
-		      "[ \t]*$"))
+                      org-tag-group-optional-re))
 	(org-compute-latex-and-related-regexp)))))
 
 (defun org-collect-keywords (keywords &optional unique directory)
@@ -12047,7 +12069,7 @@ in Lisp code use `org-set-tags' instead."
 	       (tags
 		(replace-regexp-in-string
 		 ;; Ignore all forbidden characters in tags.
-		 "[^[:alnum:]_@#%]+" ":"
+		 (concat org-tag-invalid-char-re "+") ":" ; a run of bad chars -> one colon
 		 (if (or (eq t org-use-fast-tag-selection)
 			 (and org-use-fast-tag-selection
 			      (delq nil (mapcar #'cdr table))))
-- 
2.39.5 (Apple Git-154)

