root/trunk/tdtd/tdtd-make-regexp.el
| Revision 5, 12.9 KB (checked in by tkg, 3 years ago) |
|---|
| Line | |
|---|---|
| 1 | ;;; tdtd-make-regexp.el --- generate efficient regexps to match strings. |
| 2 | |
| 3 | ;; Copyright (C) 1994, 1995 Simon Marshall. |
| 4 | |
| 5 | ;; Author: Simon Marshall <simon@gnu.ai.mit.edu> |
| 6 | ;; Keywords: strings, regexps |
| 7 | ;; Version: 1.02 |
| 8 | |
| 9 | ;; LCD Archive Entry: |
| 10 | ;; make-regexp|Simon Marshall|simon@gnu.ai.mit.edu| |
| 11 | ;; Generate efficient regexps to match strings.| |
| 12 | ;; 11-Jul-1995|1.02|~/functions/make-regexp.el.gz| |
| 13 | |
| 14 | ;; The archive is archive.cis.ohio-state.edu in /pub/gnu/emacs/elisp-archive. |
| 15 | |
| 16 | ;;; This file is not part of GNU Emacs. |
| 17 | |
| 18 | ;;; This program is free software; you can redistribute it and/or modify |
| 19 | ;;; it under the terms of the GNU General Public License as published by |
| 20 | ;;; the Free Software Foundation; either version 2, or (at your option) |
| 21 | ;;; any later version. |
| 22 | |
| 23 | ;;; This program is distributed in the hope that it will be useful, |
| 24 | ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 25 | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 26 | ;;; GNU General Public License for more details. |
| 27 | |
| 28 | ;;; You should have received a copy of the GNU General Public License |
| 29 | ;;; along with GNU Emacs; see the file COPYING. If not, write to |
| 30 | ;;; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. |
| 31 | |
| 32 | ;;; Commentary: |
| 33 | |
| 34 | ;; Purpose: |
| 35 | ;; |
| 36 | ;; To make efficient regexps from lists of strings. |
| 37 | |
| 38 | ;; For example: |
| 39 | ;; |
| 40 | ;; (let ((strings '("cond" "if" "while" "let\\*?" "prog1" "prog2" "progn" |
| 41 | ;; "catch" "throw" "save-restriction" "save-excursion" |
| 42 | ;; "save-window-excursion" "save-match-data" |
| 43 | ;; "unwind-protect" "condition-case" "track-mouse"))) |
| 44 | ;; (concat "(" (tdtd-make-regexp strings t))) |
| 45 | ;; |
| 46 | ;; => "(\\(c\\(atch\\|ond\\(\\|ition-case\\)\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(excursion\\|match-data\\|restriction\\|window-excursion\\)\\|t\\(hrow\\|rack-mouse\\)\\|unwind-protect\\|while\\)" |
| 47 | ;; |
| 48 | ;; To search for the above regexp takes about 70% of the time as for the simple |
| 49 | ;; (concat "(\\(" (mapconcat 'identity strings "\\|") "\\)") regexp. |
| 50 | ;; |
| 51 | ;; Obviously, the more the similarity between strings, the faster the regexp: |
| 52 | ;; |
| 53 | ;; (tdtd-make-regexp '("abort" "abs" "accept" "access" "array" "begin" "body" "case" |
| 54 | ;; "constant" "declare" "delay" "delta" "digits" "else" "elsif" |
| 55 | ;; "entry" "exception" "exit" "function" "generic" "goto" "if" |
| 56 | ;; "others" "limited" "loop" "mod" "new" "null" "out" "subtype" |
| 57 | ;; "package" "pragma" "private" "procedure" "raise" "range" |
| 58 | ;; "record" "rem" "renames" "return" "reverse" "select" |
| 59 | ;; "separate" "task" "terminate" "then" "type" "when" "while" |
| 60 | ;; "with" "xor")) |
| 61 | ;; |
| 62 | ;; => "a\\(b\\(ort\\|s\\)\\|cce\\(pt\\|ss\\)\\|rray\\)\\|b\\(egin\\|ody\\)\\|c\\(ase\\|onstant\\)\\|d\\(e\\(clare\\|l\\(ay\\|ta\\)\\)\\|igits\\)\\|e\\(ls\\(e\\|if\\)\\|ntry\\|x\\(ception\\|it\\)\\)\\|function\\|g\\(eneric\\|oto\\)\\|if\\|l\\(imited\\|oop\\)\\|mod\\|n\\(ew\\|ull\\)\\|o\\(thers\\|ut\\)\\|p\\(ackage\\|r\\(agma\\|ivate\\|ocedure\\)\\)\\|r\\(a\\(ise\\|nge\\)\\|e\\(cord\\|m\\|names\\|turn\\|verse\\)\\)\\|s\\(e\\(lect\\|parate\\)\\|ubtype\\)\\|t\\(ask\\|erminate\\|hen\\|ype\\)\\|w\\(h\\(en\\|ile\\)\\|ith\\)\\|xor" |
| 63 | ;; |
| 64 | ;; To search for the above regexp takes less than 60% of the time of the simple |
| 65 | ;; mapconcat equivalent. |
| 66 | ;; |
| 67 | ;; But even small regexps may be worth it: |
| 68 | ;; |
| 69 | ;; (tdtd-make-regexp '("and" "at" "do" "end" "for" "in" "is" "not" "of" "or" "use")) |
| 70 | ;; => "a\\(nd\\|t\\)\\|do\\|end\\|for\\|i[ns]\\|not\\|o[fr]\\|use" |
| 71 | ;; |
| 72 | ;; as this is 10% faster than the mapconcat equivalent. |
| 73 | |
| 74 | ;; Installation: |
| 75 | ;; |
| 76 | ;; (autoload 'tdtd-make-regexp "tdtd-make-regexp" |
| 77 | ;; "Return a regexp to match a string item in STRINGS.") |
| 78 | ;; |
| 79 | ;; (autoload 'tdtd-make-regexps "tdtd-make-regexp" |
| 80 | ;; "Return a regexp to REGEXPS.") |
| 81 | ;; |
| 82 | ;; Since these functions were written to produce efficient regexps, not regexps |
| 83 | ;; efficiently, it is probably not a good idea to in-line too many calls in |
| 84 | ;; your code, unless you use the following neat trick with `eval-when-compile': |
| 85 | ;; |
| 86 | ;; (defvar definition-regexp |
| 87 | ;; (let ((regexp (eval-when-compile |
| 88 | ;; (tdtd-make-regexp '("defun" "defsubst" "defmacro" "defalias" |
| 89 | ;; "defvar" "defconst" "defadvice") t)))) |
| 90 | ;; (concat "^(" regexp))) |
| 91 | ;; |
| 92 | ;; The `byte-compile' code will be as if you had defined the variable thus: |
| 93 | ;; |
| 94 | ;; (defvar definition-regexp |
| 95 | ;; "^(\\(def\\(a\\(dvice\\|lias\\)\\|const\\|macro\\|subst\\|un\\|var\\)\\)") |
| 96 | |
| 97 | ;; Feedback: |
| 98 | ;; |
| 99 | ;; Originally written for font-lock, from an idea from Stig's hl319. |
| 100 | ;; Please don't tell me that it doesn't produce optimal regexps; I know that |
| 101 | ;; already. But (ideas or) code to improve things (are) is welcome. Please |
| 102 | ;; test your code and tell me the speed up in searching an appropriate buffer. |
| 103 | ;; |
| 104 | ;; Please send me bug reports, bug fixes, and extensions, etc. |
| 105 | ;; Simon Marshall <simon@gnu.ai.mit.edu> |
| 106 | |
| 107 | ;; History: |
| 108 | ;; |
| 109 | ;; 1.00--1.01: |
| 110 | ;; - Made `make-regexp' take `lax' to force top-level parentheses. |
| 111 | ;; - Fixed `make-regexps' for MATCH bug and new `font-lock-keywords'. |
| 112 | ;; - Added `unfontify' to user timing functions. |
| 113 | ;; 1.01--1.02: |
| 114 | ;; - Made `make-regexp' `let' a big `max-lisp-eval-depth'. |
| 115 | |
| 116 | ;; The basic idea is to find the shortest common non-"" prefix each time, and |
| 117 | ;; squirrel it out. If there is no such prefix, we divide the list into two so |
| 118 | ;; that (at least) one half will have at least a one-character common prefix. |
| 119 | |
| 120 | ;; In addition, we (a) delay the addition of () parenthesis as long as possible |
| 121 | ;; (until we're sure we need them), and (b) try to squirrel out one-character |
| 122 | ;; sequences (so we can use [] rather than ()). |
| 123 | |
| 124 | (defun tdtd-make-regexp (strings &optional paren lax) |
| 125 | "Return a regexp to match a string item in STRINGS. |
| 126 | If optional PAREN non-nil, output regexp parentheses around returned regexp. |
| 127 | If optional LAX non-nil, don't output parentheses if it doesn't require them. |
| 128 | Merges keywords to avoid backtracking in Emacs' regexp matcher." |
| 129 | (let* ((max-lisp-eval-depth (* 1024 1024)) |
| 130 | (strings (let ((l strings)) ; Paranoia---make strings unique! |
| 131 | (while l (setq l (setcdr l (delete (car l) (cdr l))))) |
| 132 | (sort strings 'string-lessp))) |
| 133 | (open-paren (if paren "\\(" "")) (close-paren (if paren "\\)" "")) |
| 134 | (open-lax (if lax "" open-paren)) (close-lax (if lax "" close-paren)) |
| 135 | (completion-ignore-case nil)) |
| 136 | (cond |
| 137 | ;; If there's only one string, just return it. |
| 138 | ((= (length strings) 1) |
| 139 | (concat open-lax (car strings) close-lax)) |
| 140 | ;; If there's an empty string, pull it out. |
| 141 | ((string= (car strings) "") |
| 142 | (if (and (= (length strings) 2) (= (length (nth 1 strings)) 1)) |
| 143 | (concat open-lax (nth 1 strings) "?" close-lax) |
| 144 | (concat open-paren "\\|" (tdtd-make-regexp (cdr strings)) close-paren))) |
| 145 | ;; If there are only one-character strings, make a [] list instead. |
| 146 | ((= (length strings) (apply '+ (mapcar 'length strings))) |
| 147 | (concat open-lax "[" (mapconcat 'identity strings "") "]" close-lax)) |
| 148 | (t |
| 149 | ;; We have a list of strings. Is there a common prefix? |
| 150 | (let ((prefix (try-completion "" (mapcar 'list strings)))) |
| 151 | (if (> (length prefix) 0) |
| 152 | ;; Common prefix! Squirrel it out and recurse with the suffixes. |
| 153 | (let* ((len (length prefix)) |
| 154 | (sufs (mapcar '(lambda (str) (substring str len)) strings))) |
| 155 | (concat open-paren prefix (tdtd-make-regexp sufs t t) close-paren)) |
| 156 | ;; No common prefix. Is there a one-character sequence? |
| 157 | (let ((letters (let ((completion-regexp-list '("^.$"))) |
| 158 | (all-completions "" (mapcar 'list strings))))) |
| 159 | (if (> (length letters) 1) |
| 160 | ;; Do the one-character sequences, then recurse on the rest. |
| 161 | (let ((rest (let ((completion-regexp-list '("^..+$"))) |
| 162 | (all-completions "" (mapcar 'list strings))))) |
| 163 | (concat open-paren |
| 164 | (tdtd-make-regexp letters) "\\|" (tdtd-make-regexp rest) |
| 165 | close-paren)) |
| 166 | ;; No one-character sequence, so divide the list into two by |
| 167 | ;; dividing into those that start with a particular letter, and |
| 168 | ;; those that do not. |
| 169 | (let* ((char (substring (car strings) 0 1)) |
| 170 | (half1 (all-completions char (mapcar 'list strings))) |
| 171 | (half2 (nthcdr (length half1) strings))) |
| 172 | (concat open-paren |
| 173 | (tdtd-make-regexp half1) "\\|" (tdtd-make-regexp half2) |
| 174 | close-paren)))))))))) |
| 175 | |
| 176 | ;; This stuff is realy for font-lock... |
| 177 | |
| 178 | ;; Ahhh, the wonders of lisp... |
| 179 | (defun regexp-span (regexp &optional start) |
| 180 | "Return the span or depth of REGEXP. |
| 181 | This means the number of \"\\\\(...\\\\)\" pairs in REGEXP, optionally from START." |
| 182 | (let ((match (string-match (regexp-quote "\\(") regexp (or start 0)))) |
| 183 | (if (not match) 0 (1+ (regexp-span regexp (match-end 0)))))) |
| 184 | |
| 185 | ;; The basic idea is to concat the regexps together, keeping count of the span |
| 186 | ;; of the regexps so that we can get the correct match for hilighting. |
| 187 | (defun tdtd-make-regexps (&rest regexps) |
| 188 | "Return a regexp to match REGEXPS |
| 189 | Each item of REGEXPS should be of the form: |
| 190 | |
| 191 | STRING ; A STRING to be used literally. |
| 192 | (STRING MATCH FACE DATA) ; Match STRING at depth MATCH with FACE |
| 193 | ; and highlight according to DATA. |
| 194 | (STRINGS FACE DATA) ; STRINGS is a list of strings FACE is |
| 195 | ; to highlight according to DATA. |
| 196 | |
| 197 | Returns a list of the form: |
| 198 | |
| 199 | (REGEXP (MATCH FACE DATA) ...) |
| 200 | |
| 201 | For example: |
| 202 | |
| 203 | (tdtd-make-regexps \"^(\" |
| 204 | '((\"defun\" \"defalias\" \"defsubst\" \"defadvice\") keyword) |
| 205 | \"[ \t]*\" |
| 206 | '(\"\\\\([a-zA-Z-]+\\\\)?\" 1 function-name nil t)) |
| 207 | |
| 208 | => |
| 209 | |
| 210 | (\"^(\\\\(def\\\\(a\\\\(dvice\\\\|lias\\\\)\\\\|subst\\\\|un\\\\)\\\\)[ ]*\\\\([a-zA-Z-]+\\\\)?\" |
| 211 | (1 keyword) (4 function-name nil t)) |
| 212 | |
| 213 | Uses `tdtd-make-regexp' to make efficient regexps." |
| 214 | (let ((regexp "") (data ())) |
| 215 | (while regexps |
| 216 | (cond ((stringp (car regexps)) |
| 217 | (setq regexp (concat regexp (car regexps)))) |
| 218 | ((stringp (nth 0 (car regexps))) |
| 219 | (setq data (cons (cons (+ (regexp-span regexp) |
| 220 | (nth 1 (car regexps))) |
| 221 | (nthcdr 2 (car regexps))) |
| 222 | data) |
| 223 | regexp (concat regexp (nth 0 (car regexps))))) |
| 224 | (t |
| 225 | (setq data (cons (cons (1+ (regexp-span regexp)) |
| 226 | (cdr (car regexps))) |
| 227 | data) |
| 228 | regexp (concat regexp (tdtd-make-regexp (nth 0 (car regexps)) |
| 229 | t))))) |
| 230 | (setq regexps (cdr regexps))) |
| 231 | (cons regexp (nreverse data)))) |
| 232 | |
| 233 | ;; Crude-rude timing... |
| 234 | |
| 235 | (defsubst time-seconds (&optional time) |
| 236 | "Return the TIME in seconds, or the current time if not given. |
| 237 | TIME should be the same format as produced by `current-time'." |
| 238 | (let ((time (or time (current-time)))) |
| 239 | (+ (* (nth 0 time) 65536.0) (nth 1 time) (/ (nth 2 time) 1000000.0)))) |
| 240 | |
| 241 | (defsubst time-since (time) |
| 242 | "Return the time in seconds since TIME. |
| 243 | TIME should be the value of `current-time' or `time-seconds'." |
| 244 | (- (time-seconds) (if (floatp time) time (time-seconds time)))) |
| 245 | |
| 246 | (defun time-function (func &rest args) |
| 247 | "Return the time in seconds taken to execute FUNC with ARGS. |
| 248 | Returned is actually the cons pair (func-value . time)." |
| 249 | (garbage-collect) |
| 250 | (let ((start (time-seconds))) |
| 251 | (cons (apply func args) (time-since start)))) |
| 252 | |
| 253 | (defun time-regexps (regexps &optional buffer unfontify) |
| 254 | "Return corresponding list of times to fontify using REGEXPS. |
| 255 | Fontify using BUFFER, if non-nil, and UNFONTIFY first, if non-nil." |
| 256 | (save-excursion |
| 257 | (and buffer (set-buffer buffer)) |
| 258 | (let ((beg (point-min)) (end (point-max))) |
| 259 | (and unfontify (font-lock-unfontify-region beg end)) |
| 260 | (mapcar (function (lambda (regexp) |
| 261 | (let ((font-lock-keywords (list regexp))) |
| 262 | (cons (cdr (time-function 'font-lock-hack-keywords beg end)) |
| 263 | regexp)))) |
| 264 | regexps)))) |
| 265 | |
| 266 | (defun sort-font-lock-regexps (regexps &optional buffer unfontify) |
| 267 | "Return sorted times to fontify syntactically and using REGEXPS. |
| 268 | UNFONTIFY first, if non-nil." |
| 269 | (let ((regexp-time (time-regexps regexps buffer unfontify))) |
| 270 | (cons (list (apply '+ (mapcar 'car regexp-time)) 'regexps) |
| 271 | (nreverse (sort regexp-time 'car-less-than-car))))) |
| 272 | |
| 273 | (defun time-fontification (&optional buffer unfontify) |
| 274 | "Return time to fontify syntactically. |
| 275 | UNFONTIFY first, if non-nil." |
| 276 | (save-excursion |
| 277 | (and buffer (set-buffer buffer)) |
| 278 | (let ((beg (point-min)) (end (point-max))) |
| 279 | (and unfontify (font-lock-unfontify-region beg end)) |
| 280 | (cdr (time-function 'font-lock-fontify-region beg end))))) |
| 281 | |
| 282 | (defun sort-font-lock-fontification (regexps &optional buffer unfontify) |
| 283 | "Return sorted times to fontify syntactically and using REGEXPS. |
| 284 | UNFONTIFY first, if non-nil." |
| 285 | (let ((syntactic-time (time-fontification buffer unfontify)) |
| 286 | (regexp-time (time-regexps regexps buffer))) |
| 287 | (nreverse |
| 288 | (sort (append (list (list syntactic-time 'syntactic) |
| 289 | (list (apply '+ (mapcar 'car regexp-time)) 'regexps)) |
| 290 | regexp-time) |
| 291 | 'car-less-than-car)))) |
| 292 | |
| 293 | ;;; tdtd-make-regexp.el ends here |
Note: See TracBrowser
for help on using the browser.
