root/trunk/tdtd/tdtd-make-regexp.el

Revision 5, 12.9 KB (checked in by tkg, 3 years ago)

Renamed make-regexp.el to tdtd-make-regexp.el since xslide uses file with same name.

Line 
1;;; tdtd-make-regexp.el --- generate efficient regexps to match strings.
2
3;; Copyright (C) 1994, 1995 Simon Marshall.
4
5;; Author: Simon Marshall <simon@gnu.ai.mit.edu>
6;; Keywords: strings, regexps
7;; Version: 1.02
8
9;; LCD Archive Entry:
10;; make-regexp|Simon Marshall|simon@gnu.ai.mit.edu|
11;; Generate efficient regexps to match strings.|
12;; 11-Jul-1995|1.02|~/functions/make-regexp.el.gz|
13
14;; The archive is archive.cis.ohio-state.edu in /pub/gnu/emacs/elisp-archive.
15
16;;; This file is not part of GNU Emacs.
17
18;;; This program is free software; you can redistribute it and/or modify
19;;; it under the terms of the GNU General Public License as published by
20;;; the Free Software Foundation; either version 2, or (at your option)
21;;; any later version.
22
23;;; This program is distributed in the hope that it will be useful,
24;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
25;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26;;; GNU General Public License for more details.
27
28;;; You should have received a copy of the GNU General Public License
29;;; along with GNU Emacs; see the file COPYING.  If not, write to
30;;; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
31
32;;; Commentary:
33
34;; Purpose:
35;;
36;; To make efficient regexps from lists of strings.
37
38;; For example:
39;;
40;; (let ((strings '("cond" "if" "while" "let\\*?" "prog1" "prog2" "progn"
41;;                  "catch" "throw" "save-restriction" "save-excursion"
42;;                  "save-window-excursion" "save-match-data"
43;;                  "unwind-protect" "condition-case" "track-mouse")))
44;;   (concat "(" (tdtd-make-regexp strings t)))
45;;
46;;      => "(\\(c\\(atch\\|ond\\(\\|ition-case\\)\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(excursion\\|match-data\\|restriction\\|window-excursion\\)\\|t\\(hrow\\|rack-mouse\\)\\|unwind-protect\\|while\\)"
47;;
48;; To search for the above regexp takes about 70% of the time as for the simple
49;; (concat "(\\(" (mapconcat 'identity strings "\\|") "\\)") regexp.
50;;
51;; Obviously, the more the similarity between strings, the faster the regexp:
52;;
53;; (tdtd-make-regexp '("abort" "abs" "accept" "access" "array" "begin" "body" "case"
54;;                "constant" "declare" "delay" "delta" "digits" "else" "elsif"
55;;                "entry" "exception" "exit" "function"  "generic" "goto" "if"
56;;                "others" "limited" "loop" "mod" "new" "null" "out" "subtype"
57;;                "package" "pragma" "private" "procedure" "raise" "range"
58;;                "record" "rem" "renames" "return" "reverse" "select"
59;;                "separate" "task" "terminate" "then" "type" "when" "while"
60;;                "with" "xor"))
61;;
62;;     => "a\\(b\\(ort\\|s\\)\\|cce\\(pt\\|ss\\)\\|rray\\)\\|b\\(egin\\|ody\\)\\|c\\(ase\\|onstant\\)\\|d\\(e\\(clare\\|l\\(ay\\|ta\\)\\)\\|igits\\)\\|e\\(ls\\(e\\|if\\)\\|ntry\\|x\\(ception\\|it\\)\\)\\|function\\|g\\(eneric\\|oto\\)\\|if\\|l\\(imited\\|oop\\)\\|mod\\|n\\(ew\\|ull\\)\\|o\\(thers\\|ut\\)\\|p\\(ackage\\|r\\(agma\\|ivate\\|ocedure\\)\\)\\|r\\(a\\(ise\\|nge\\)\\|e\\(cord\\|m\\|names\\|turn\\|verse\\)\\)\\|s\\(e\\(lect\\|parate\\)\\|ubtype\\)\\|t\\(ask\\|erminate\\|hen\\|ype\\)\\|w\\(h\\(en\\|ile\\)\\|ith\\)\\|xor"
63;;
64;; To search for the above regexp takes less than 60% of the time of the simple
65;; mapconcat equivalent.
66;;
67;; But even small regexps may be worth it:
68;;
69;; (tdtd-make-regexp '("and" "at" "do" "end" "for" "in" "is" "not" "of" "or" "use"))
70;;     => "a\\(nd\\|t\\)\\|do\\|end\\|for\\|i[ns]\\|not\\|o[fr]\\|use"
71;;
72;; as this is 10% faster than the mapconcat equivalent.
73
74;; Installation:
75;;
76;; (autoload 'tdtd-make-regexp "tdtd-make-regexp"
77;;   "Return a regexp to match a string item in STRINGS.")
78;;
79;; (autoload 'tdtd-make-regexps "tdtd-make-regexp"
80;;   "Return a regexp to REGEXPS.")
81;;
82;; Since these functions were written to produce efficient regexps, not regexps
83;; efficiently, it is probably not a good idea to in-line too many calls in
84;; your code, unless you use the following neat trick with `eval-when-compile':
85;;
86;; (defvar definition-regexp
87;;   (let ((regexp (eval-when-compile
88;;                   (tdtd-make-regexp '("defun" "defsubst" "defmacro" "defalias"
89;;                                  "defvar" "defconst" "defadvice") t))))
90;;     (concat "^(" regexp)))
91;;
92;; The `byte-compile' code will be as if you had defined the variable thus:
93;;
94;; (defvar definition-regexp
95;;   "^(\\(def\\(a\\(dvice\\|lias\\)\\|const\\|macro\\|subst\\|un\\|var\\)\\)")
96
97;; Feedback:
98;;
99;; Originally written for font-lock, from an idea from Stig's hl319.
100;; Please don't tell me that it doesn't produce optimal regexps; I know that
101;; already.  But (ideas or) code to improve things (are) is welcome.  Please
102;; test your code and tell me the speed up in searching an appropriate buffer.
103;;
104;; Please send me bug reports, bug fixes, and extensions, etc.
105;; Simon Marshall <simon@gnu.ai.mit.edu>
106
107;; History:
108;;
109;; 1.00--1.01:
110;; - Made `make-regexp' take `lax' to force top-level parentheses.
111;; - Fixed `make-regexps' for MATCH bug and new `font-lock-keywords'.
112;; - Added `unfontify' to user timing functions.
113;; 1.01--1.02:
114;; - Made `make-regexp' `let' a big `max-lisp-eval-depth'.
115
116;; The basic idea is to find the shortest common non-"" prefix each time, and
117;; squirrel it out.  If there is no such prefix, we divide the list into two so
118;; that (at least) one half will have at least a one-character common prefix.
119
120;; In addition, we (a) delay the addition of () parenthesis as long as possible
121;; (until we're sure we need them), and (b) try to squirrel out one-character
122;; sequences (so we can use [] rather than ()).
123
124(defun tdtd-make-regexp (strings &optional paren lax)
125  "Return a regexp to match a string item in STRINGS.
126If optional PAREN non-nil, output regexp parentheses around returned regexp.
127If optional LAX non-nil, don't output parentheses if it doesn't require them.
128Merges keywords to avoid backtracking in Emacs' regexp matcher."
129  (let* ((max-lisp-eval-depth (* 1024 1024))
130         (strings (let ((l strings))    ; Paranoia---make strings unique!
131                    (while l (setq l (setcdr l (delete (car l) (cdr l)))))
132                    (sort strings 'string-lessp)))
133         (open-paren (if paren "\\(" "")) (close-paren (if paren "\\)" ""))
134         (open-lax (if lax "" open-paren)) (close-lax (if lax "" close-paren))
135         (completion-ignore-case nil))
136    (cond
137     ;; If there's only one string, just return it.
138     ((= (length strings) 1)
139      (concat open-lax (car strings) close-lax))
140     ;; If there's an empty string, pull it out.
141     ((string= (car strings) "")
142      (if (and (= (length strings) 2) (= (length (nth 1 strings)) 1))
143          (concat open-lax (nth 1 strings) "?" close-lax)
144        (concat open-paren "\\|" (tdtd-make-regexp (cdr strings)) close-paren)))
145     ;; If there are only one-character strings, make a [] list instead.
146     ((= (length strings) (apply '+ (mapcar 'length strings)))
147      (concat open-lax "[" (mapconcat 'identity strings "") "]" close-lax))
148     (t
149      ;; We have a list of strings.  Is there a common prefix?
150      (let ((prefix (try-completion "" (mapcar 'list strings))))
151        (if (> (length prefix) 0)
152            ;; Common prefix!  Squirrel it out and recurse with the suffixes.
153            (let* ((len (length prefix))
154                   (sufs (mapcar '(lambda (str) (substring str len)) strings)))
155              (concat open-paren prefix (tdtd-make-regexp sufs t t) close-paren))
156          ;; No common prefix.  Is there a one-character sequence?
157          (let ((letters (let ((completion-regexp-list '("^.$")))
158                           (all-completions "" (mapcar 'list strings)))))
159            (if (> (length letters) 1)
160                ;; Do the one-character sequences, then recurse on the rest.
161                (let ((rest (let ((completion-regexp-list '("^..+$")))
162                              (all-completions "" (mapcar 'list strings)))))
163                  (concat open-paren
164                          (tdtd-make-regexp letters) "\\|" (tdtd-make-regexp rest)
165                          close-paren))
166              ;; No one-character sequence, so divide the list into two by
167              ;; dividing into those that start with a particular letter, and
168              ;; those that do not.
169              (let* ((char (substring (car strings) 0 1))
170                     (half1 (all-completions char (mapcar 'list strings)))
171                     (half2 (nthcdr (length half1) strings)))
172                (concat open-paren
173                        (tdtd-make-regexp half1) "\\|" (tdtd-make-regexp half2)
174                        close-paren))))))))))
175
176;; This stuff is realy for font-lock...
177
178;; Ahhh, the wonders of lisp...
179(defun regexp-span (regexp &optional start)
180  "Return the span or depth of REGEXP.
181This means the number of \"\\\\(...\\\\)\" pairs in REGEXP, optionally from START."
182  (let ((match (string-match (regexp-quote "\\(") regexp (or start 0))))
183    (if (not match) 0 (1+ (regexp-span regexp (match-end 0))))))
184
185;; The basic idea is to concat the regexps together, keeping count of the span
186;; of the regexps so that we can get the correct match for hilighting.
187(defun tdtd-make-regexps (&rest regexps)
188  "Return a regexp to match REGEXPS
189Each item of REGEXPS should be of the form:
190
191 STRING                                 ; A STRING to be used literally.
192 (STRING MATCH FACE DATA)               ; Match STRING at depth MATCH with FACE
193                                        ; and highlight according to DATA.
194 (STRINGS FACE DATA)                    ; STRINGS is a list of strings FACE is
195                                        ; to highlight according to DATA.
196
197Returns a list of the form:
198
199 (REGEXP (MATCH FACE DATA) ...)
200
201For example:
202
203 (tdtd-make-regexps \"^(\"
204               '((\"defun\" \"defalias\" \"defsubst\" \"defadvice\") keyword)
205               \"[ \t]*\"
206               '(\"\\\\([a-zA-Z-]+\\\\)?\" 1 function-name nil t))
207
208     =>
209
210 (\"^(\\\\(def\\\\(a\\\\(dvice\\\\|lias\\\\)\\\\|subst\\\\|un\\\\)\\\\)[        ]*\\\\([a-zA-Z-]+\\\\)?\"
211  (1 keyword) (4 function-name nil t))
212
213Uses `tdtd-make-regexp' to make efficient regexps."
214  (let ((regexp "") (data ()))
215    (while regexps
216      (cond ((stringp (car regexps))
217             (setq regexp (concat regexp (car regexps))))
218            ((stringp (nth 0 (car regexps)))
219             (setq data (cons (cons (+ (regexp-span regexp)
220                                       (nth 1 (car regexps)))
221                                    (nthcdr 2 (car regexps)))
222                              data)
223                   regexp (concat regexp (nth 0 (car regexps)))))
224            (t
225             (setq data (cons (cons (1+ (regexp-span regexp))
226                                    (cdr (car regexps)))
227                              data)
228                   regexp (concat regexp (tdtd-make-regexp (nth 0 (car regexps))
229                                                      t)))))
230      (setq regexps (cdr regexps)))
231    (cons regexp (nreverse data))))
232
233;; Crude-rude timing...
234
235(defsubst time-seconds (&optional time)
236  "Return the TIME in seconds, or the current time if not given.
237TIME should be the same format as produced by `current-time'."
238  (let ((time (or time (current-time))))
239    (+ (* (nth 0 time) 65536.0) (nth 1 time) (/ (nth 2 time) 1000000.0))))
240
241(defsubst time-since (time)
242  "Return the time in seconds since TIME.
243TIME should be the value of `current-time' or `time-seconds'."
244  (- (time-seconds) (if (floatp time) time (time-seconds time))))
245
246(defun time-function (func &rest args)
247  "Return the time in seconds taken to execute FUNC with ARGS.
248Returned is actually the cons pair (func-value . time)."
249  (garbage-collect)
250  (let ((start (time-seconds)))
251    (cons (apply func args) (time-since start))))
252
253(defun time-regexps (regexps &optional buffer unfontify)
254  "Return corresponding list of times to fontify using REGEXPS.
255Fontify using BUFFER, if non-nil, and UNFONTIFY first, if non-nil."
256  (save-excursion
257    (and buffer (set-buffer buffer))
258    (let ((beg (point-min)) (end (point-max)))
259      (and unfontify (font-lock-unfontify-region beg end))
260      (mapcar (function (lambda (regexp)
261               (let ((font-lock-keywords (list regexp)))
262                 (cons (cdr (time-function 'font-lock-hack-keywords beg end))
263                       regexp))))
264              regexps))))
265
266(defun sort-font-lock-regexps (regexps &optional buffer unfontify)
267  "Return sorted times to fontify syntactically and using REGEXPS.
268UNFONTIFY first, if non-nil."
269  (let ((regexp-time (time-regexps regexps buffer unfontify)))
270    (cons (list (apply '+ (mapcar 'car regexp-time)) 'regexps)
271          (nreverse (sort regexp-time 'car-less-than-car)))))
272
273(defun time-fontification (&optional buffer unfontify)
274  "Return time to fontify syntactically.
275UNFONTIFY first, if non-nil."
276  (save-excursion
277    (and buffer (set-buffer buffer))
278    (let ((beg (point-min)) (end (point-max)))
279      (and unfontify (font-lock-unfontify-region beg end))
280      (cdr (time-function 'font-lock-fontify-region beg end)))))
281
282(defun sort-font-lock-fontification (regexps &optional buffer unfontify)
283  "Return sorted times to fontify syntactically and using REGEXPS.
284UNFONTIFY first, if non-nil."
285  (let ((syntactic-time (time-fontification buffer unfontify))
286        (regexp-time (time-regexps regexps buffer)))
287    (nreverse
288     (sort (append (list (list syntactic-time 'syntactic)
289                         (list (apply '+ (mapcar 'car regexp-time)) 'regexps))
290                   regexp-time)
291           'car-less-than-car))))
292
293;;; tdtd-make-regexp.el ends here
Note: See TracBrowser for help on using the browser.