Ticket #2390: canonical.php

File canonical.php, 10.8 KB (added by woidrick, 10 years ago)

hack, sorry, cant use diff

Line 
1<?php
2
/* This class represents elements of the table which matches indexed
   alternate canonical forms of search terms, and also provides
   services to canonicalise and generalise search terms.

   All the services are static; they are declared as such so that the
   canonical::xxx() calls used throughout keep working on current PHP
   versions, where calling a non-static method statically is an error. */

class canonical {

  public $canonical;  /* a canonical UTF-8 string, that is punctuation
                         removed, no spaces, diacriticals and ligatures
                         reduced to ascii equivalents */
  public $region;     /* the region number for this canonical string */

  // --------------------------------------------------
  /* constructor: each property is only assigned when the corresponding
     argument is not NULL */
  public function __construct($canonical=NULL, $region=NULL) {
    if (! is_null($canonical)) { $this->canonical = $canonical; }
    if (! is_null($region)) { $this->region = $region; }
  }

  // --------------------------------------------------
  /* Old (PHP 4 style) constructor name, kept as an ordinary method so
     that any code which still calls it explicitly continues to work.
     It must stay declared after __construct(). */
  public function canonical($canonical=NULL, $region=NULL) {
    $this->__construct($canonical, $region);
  }

  // --------------------------------------------------
  private static function load_table($file) {
    /* Loads one of the lookup tables (a php file which returns an array)
       and caches it for the rest of the request.

       file: name of the table file, resolved through the include path

       Returns the table, or an empty array when it could not be loaded
       (php itself emits the warning about the missing file), so callers
       can always treat the result as an array. */
    static $cache = array();
    if (! isset($cache[$file])) {
      $table = include_once($file);
      if ($table === TRUE) {
        /* include_once yields TRUE rather than the file's return value
           when something else already included the file; the table files
           only return an array, so including again is safe */
        $table = include($file);
      }
      $cache[$file] = is_array($table) ? $table : array();
    }
    return $cache[$file];
  }

  // --------------------------------------------------
  private static function sql_number($value) {
    /* Formats a value as a plain decimal literal for embedding in SQL:
       forced to float, never locale dependent (%F), ten decimal places */
    return sprintf('%.10F', (float) $value);
  }

  // --------------------------------------------------
  public static function canonical_basic($term, $alternates=FALSE) {

    /* produces an array of "words" where each word is itself an array
       of alternate forms of that word according to replacements for
       utf8 diacriticals and the like. A word is canonical - that is
       uses a single form of diacritical and ligature equivalents,
       lower case, punctuation free and so on - so that when a similar
       name is converted on search, it can also be converted to
       canonical form to compare with the canonical index.

       term: a name being indexed or sought, for example 'Ben Nevis'
         or 'Newmarket Road [A1304]'

       alternates: whether to consider replacing a diacritical
         character with multiple alternate replacements, for example
         Danish aring character with aa and a
         (we normally only do this on indexing, not on lookup).

       Returns a list whose elements are either an array of variant
       strings for one word, or the plain string '|' which marks the
       start of a bracketed clause ('[...').

       The canonical form also splits each word into two entries at the
       point where a separable suffix starts. For example
       'hauptbahnweg' -> array('hauptbahn'), array('weg') */

    /* The replacements table maps UTF characters (multiple byte keys) to
       ascii equivalents so that characters such as u-umlaut can be
       matched by u and ue. There are multiple tables because some
       characters have more than one functional replacement (as for
       u-umlaut). We store multiple canonical forms in the word index,
       but search only on one (so M<u-umlaut>nchen is stored as 'munchen'
       and 'muenchen').

       see http://www.utf8-chartable.de/unicode-utf8-table.pl for a utf-8 list
    */

    $replacements = self::load_table('utf8.inc.php');
    $suffixes = self::load_table('suffixes.inc.php');
    $languagecodes = self::load_table('languagecodes.inc.php');

    /* separate the search terms into words: '&' becomes ' and ', runs of
       separator punctuation become one space, and free-standing articles
       are dropped */
    $term = preg_replace('~[ \\-\\/\\:\\;\\=\\|\\,]+~', ' ',
                         preg_replace('/\\&/', ' and ', $term));
    $term = trim(preg_replace('/ (the|der|das|die|le|la|el|il) /i', ' ', " {$term} "));
    $words = explode(' ', $term);

    /* number of replacement tables to consult: just the main one on
       lookup, all of them when alternates are wanted */
    $na = $alternates ? max(1, count($replacements)) : 1;

    $canonicalvariants = array();

    foreach ($words as $word) {
      /* NOTE: strtolower works on bytes, so only ascii letters are
         lowered here; other characters rely on the replacement tables */
      $word = trim(strtolower($word));

      /* remove apostrophe-s: these are always stored and searched in the
         singular non-possessive so that (the church of, for example)
         'St Andrew's', 'St Andrews' and 'St Andrew' all match
         equivalently. Both the typographic (U+2019) and the ascii
         apostrophe are recognised, and only at the end of the word. */
      $apostrophe = FALSE;
      foreach (array("\xe2\x80\x99s", '\'s') as $possessive) {
        $np = strlen($possessive);
        if (strlen($word) >= $np && substr($word, - $np) === $possessive) {
          $word = (string) substr($word, 0, strlen($word) - $np);
          $apostrophe = TRUE;
          break;
        }
      }
      /* compare with '' rather than using empty(), which would also
         throw away the perfectly good word '0' */
      if ($word === '') { continue; }

      if ($word[0] == '[') {
        /* separate clauses within the phrase with |,
           and remove any language codes following, as in [de ... ] */
        $canonicalvariants[] = '|';
        $word = (string) substr($word, 1);
        if ($word === '') { continue; }
        if ($alternates && array_key_exists($word, $languagecodes)) { continue; }
      }

      /* replace listed UTF-8 characters with their ascii equivalents.
         For search words we only replace from the main replacement
         table (hence $na = 1), but so that we get alternatives to
         index, we replace from all the tables in turn. A character
         missing from the main table is kept as it is; one missing from
         a further table simply adds no alternative. */
      $l = mb_strlen($word, 'UTF-8');
      $letters = array();
      for ($i = 0; $i < $l; $i++) {
        $c = mb_substr($word, $i, 1, 'UTF-8');
        $letters[$i] = array();
        for ($alt = 0; $alt < $na; $alt++) {
          if (isset($replacements[$alt]) && is_array($replacements[$alt])
              && array_key_exists($c, $replacements[$alt])) {
            $letters[$i][] = $replacements[$alt][$c];
          } else if ($alt == 0) {
            $letters[$i][] = $c;
          }
        }
      }

      /* so now we have an array of each letter in its several alternate
         forms. Build an array of every combination of them */
      $wordvariants = array('');
      foreach ($letters as $lettervariants) {
        $newwordvariants = array();
        foreach ($wordvariants as $variant) {
          foreach ($lettervariants as $lettercombination) {
            $newwordvariants[] = $variant . $lettercombination;
          }
        }
        $wordvariants = $newwordvariants;
      }

      /* now add the plural form if apostrophe s was present */
      if ($apostrophe && $alternates) {
        $nt = count($wordvariants);
        for ($t = 0; $t < $nt; $t++) {
          $wordvariants[] = "{$wordvariants[$t]}s";
        }
      }

      /* $wordvariants now contains an array of possible variants on
         the word under consideration. If one ends in a concatenated
         suffix like 'strasse', split it - e.g. 'hauptbahnstrasse' ->
         'hauptbahn strasse' - so we'll always look for, abbreviate and
         deabbreviate, and find, the separated variety. Only the first
         matching suffix is removed from each variant, and when variants
         match different suffixes the last one matched is the one
         emitted as the separate word. */
      $addsuffix = NULL;
      foreach ($wordvariants as $t => $variant) {
        foreach ($suffixes as $suffix) {
          $suffix = (string) $suffix;
          $ns = strlen($suffix);
          if (strlen($variant) > $ns && substr($variant, - $ns) === $suffix) {
            $wordvariants[$t] = substr($variant, 0, - $ns);
            $addsuffix = $suffix;
            break;
          }
        }
      }

      $canonicalvariants[] = $wordvariants;
      if ($addsuffix !== NULL) {
        $canonicalvariants[] = array($addsuffix);
      }
    }

    /* now we have an array $canonicalvariants each element of which is
       an array of alternate possibilities for that original word. We
       have at minimum got rid of any ligatures, diacriticals etc, so we
       don't have to worry about alternatives to strasse and the like
       any more */

    return $canonicalvariants;
  }

  // --------------------------------------------------
  public static function canonical_with_synonym($term) {
    /* canonicalise the term as above (with alternates), then widen each
       word's list of variants with its common abbreviations and
       expansions (road for rd etc, and vice-versa) and singular for
       plural - that's particularly important for church names and
       similar, where we want to match "St John's" with "St John" or
       "St Johns" (simple canonicalisation will have removed the
       apostrophe, so the plural to singular also acts as possessive
       to non-possessive).

       term: the name to canonicalise

       Returns the same structure as canonical_basic. Synonyms are looked
       up only for the variants canonical_basic produced, not for the
       synonyms added here. */

    $canonicalvariants = self::canonical_basic($term, TRUE);
    if (empty($canonicalvariants)) { return $canonicalvariants; }

    $synonyms = self::load_table('synonyms.inc.php');

    $nc = count($canonicalvariants);
    for ($i = 0; $i < $nc; $i++) {
      /* the '|' clause separators are plain strings, not variant lists */
      if (! is_array($canonicalvariants[$i])) { continue; }

      $nt = count($canonicalvariants[$i]);
      for ($j = 0; $j < $nt; $j++) {
        $wordvariant = $canonicalvariants[$i][$j];
        if (empty($synonyms[$wordvariant])) { continue; }

        $expansion = $synonyms[$wordvariant];
        if (! is_array($expansion)) {
          $canonicalvariants[$i][] = $expansion;
        } else if (($wordvariant === 'dr' || $wordvariant === 'st') && $i == $nc-1) {
          /* special case 'dr' and 'st' as the last word: only the first
             listed synonym is added there, to avoid lots of unnecessary
             'doctor's and 'saint's */
          $canonicalvariants[$i][] = $expansion[0];
        } else {
          $canonicalvariants[$i] = array_merge($canonicalvariants[$i], $expansion);
        }
      }
    }

    return $canonicalvariants;
  }

  // --------------------------------------------------
  public static function canonicalise_to_string($canoncalise) {
    /* Canonicalises a term for lookup (no alternates) and joins the
       result into one space separated string: the first variant of each
       word, plus any '|' clause separators. Words which canonicalise to
       nothing are left out. */
    $parts = array();
    foreach (self::canonical_basic($canoncalise) as $term) {
      if (is_array($term)) {
        /* '' is skipped, but '0' is a real word and is kept */
        if (! isset($term[0]) || (string) $term[0] === '') { continue; }
        $parts[] = $term[0];
      } else {
        $parts[] = $term;
      }
    }
    return implode(' ', $parts);
  }

  // --------------------------------------------------
  public static function distancerestriction($lat, $lon, $fi) {
    /* This generates a SQL fragment for ORDER BY so that names come back
       sorted by (squared) distance from given latitude and longitude.

       lat, lon: the reference position; forced to plain decimal numbers
         before being placed in the SQL text, so that neither arbitrary
         text nor a locale specific decimal comma can end up in the query
       fi: passed through to y_op::field() for the 'lat' and 'lon' fields

       y_op is defined elsewhere; the result is whatever
       y_op::oprintf() returns. */
    $lat = self::sql_number($lat);
    $lon = self::sql_number($lon);
    return y_op::oprintf("(pow(%f - {$lat},2) + pow(%f - {$lon},2))",
                         y_op::field('lat', $fi),
                         y_op::field('lon', $fi));
  }


  // --------------------------------------------------
  public static function getuniqueid($osmid, $type) {
    /* osm ids are only unique within type (node, way, relation), so we
       make them unique overall by shifting the osm id up two bits and
       storing the type number in the two low order bits.

       osmid: the osm id
       type: either the type number, or the type name as listed by
         getosmtypes()

       NOTE: two bits only hold 1..3, so 'changeset' (4) overwrites the
       lowest bit of the osm id and cannot be recovered by getosmid().
       Left as it is because changing the layout would invalidate ids
       already stored. */
    static $typeindexes = NULL;
    if ($typeindexes === NULL) { $typeindexes = array_flip(self::getosmtypes()); }
    $typeindex = is_int($type) ? $type : $typeindexes[$type];
    return ($osmid << 2) | $typeindex;
  }

  // --------------------------------------------------
  public static function getosmid($id, &$type) {
    /* converts from name finder id to osm id; the converse of
       getuniqueid above.

       id: a name finder id
       type: set on return to the osm type name, or to NULL when the two
         low order bits of the id do not name a type

       Returns the osm id. */
    static $types = NULL;
    if ($types === NULL) { $types = self::getosmtypes(); }
    $typeindex = $id & 0x3;
    $type = isset($types[$typeindex]) ? $types[$typeindex] : NULL;
    return $id >> 2;
  }

  // --------------------------------------------------
  public static function getosmtypes() {
    /* the osm object type names, indexed by type number */
    static $types = array(1=>'node',2=>'way',3=>'relation',4=>'changeset');
    return $types;
  }

}
272
273?>