source: subversion/sites/namefinder/php/search.php @ 12101

Last change on this file since 12101 was 12101, checked in by david, 11 years ago

add UK postcode prefixes to the database and allow searches qualified by them as if they were places.
Also, suppress leading numbers in search terms on the basis they are probably street addresses.

File size: 21.4 KB
Line 
1<?php
2
3include_once('word.php');
4
5class search {
6
7  /* The common point of call for doing searches. Call search::xmlise(). */
8
9  // --------------------------------------------------
/* static */ function xmlise($find, $maxresults, $anyoccurenceifnotlocal) {
  /* Given a search string, returns a string of complete XML describing
     the matches found for that string. This is the common point of call
     for doing searches.

     find: the search string from the user,
       search ::- nameword [nameword ...] ( ("near" | ",") placeword [placeword ...]
           ("," isinword )?  )?
       nameword ::- ( word | lat "," lon)
       placeword ::- ( word | lat "," lon)
       find ::- search ( ":" search )?

       The colon form allows the great circle distance between two results
       to be calculated. Experimentally, the namewords can also form a UK
       postcode with no qualifications. Comments are allowed in the string
       between square brackets.

     maxresults: the maximum number of results to return in the XML

     anyoccurenceifnotlocal: passed through unchanged to search::find()

     returns: a string, formatted as XML. Failures are reported in an
       error attribute on the searchresults element. If a closed.xml file
       exists in the install directory its contents are returned instead.
  */

  include_once('postcodelookup.php');
  /* $config must be declared global to be visible here, exactly as find() does */
  global $db, $config;
  include_once('options.php');

  $installdir = isset($config['installdir']) ? $config['installdir'] : '';
  $closedfile = "{$installdir}/closed.xml";
  if (file_exists($closedfile)) {
    return file_get_contents($closedfile);
  }

  /* oxml is the return string. Put admin stuff in the top level element.
     The XML declaration is assembled piecewise so no literal PHP open/close
     tag sequence appears in this file */
  $oxml = '<' . '?' . 'xml version="1.0" encoding="UTF-8"' . '?'. '>' . "\n";
  $oxml .= "<searchresults find='" . htmlspecialchars($find, ENT_QUOTES, 'UTF-8') . "'";
  $indexdate = options::getoption('indexdate');
  if ($indexdate == '') {
    $oxml .= " error='index temporarily unavailable'>\n</searchresults>\n";
    return $oxml;
  }
  $oxml .= " sourcedate='" . htmlspecialchars($indexdate, ENT_QUOTES, 'UTF-8') . "'";
  $oxml .= " date='" . date('Y-m-d H:i:s') . "'";

  /* remove comments in square brackets from the input. Each bracket pair is
     matched on its own, so "A [x] : B [y]" keeps the text between the two
     comments rather than deleting everything from the first [ to the last ] */
  $find = trim(preg_replace('/\\[[^\\]]*\\]/', '', $find));

  /* does it look like a postcode? */
  $postcodelookup = postcodelookup::postcodelookupfactory($find);
  $originalpostcode = $find;
  /* NOTE(review): get_query() presumably rewrites $find by reference into a
     name search when it is a postcode - confirm against postcodelookup.php */
  $isapostcode = $postcodelookup->get_query($find);

  $finds = explode(':', $find);
  if (count($finds) > 2) {
    $oxml .= " error='too many colons'>\n</searchresults>\n";
    return $oxml;
  }
  $finds = array_map('trim', $finds);
  $nfinds = count($finds);
  /* no closing > yet: adding find data to header and possibly errors */

  $oxml .= " distancesearch='" . ($nfinds > 1 ? 'yes' : 'no') . "'";

  $multinameds = array();

  for ($i = 0; $i < $nfinds; $i++) {
    /* that is, for each component in a search separated by colons, but often only once... */
    $thisfind = search::explodeterms($finds[$i]);

    /* the heart of the search - see find(). Note find() takes its terms by
       reference and may rewrite them, so $thisfind can differ afterwards */
    $nameds = search::find($thisfind, $maxresults, $isapostcode, $anyoccurenceifnotlocal);

    /* reflect the (possibly rewritten) search data back in the xml; the
       attribute names only get a numeric suffix in the colon form */
    $ks = $nfinds > 1 ? $i+1 : '';
    if (count($thisfind) >= 1) {
      $oxml .= " findname{$ks}='" . htmlspecialchars($thisfind[0], ENT_QUOTES, 'UTF-8') . "'";
      if (count($thisfind) >= 2) {
        $oxml .= " findplace{$ks}='" . htmlspecialchars($thisfind[1], ENT_QUOTES, 'UTF-8') . "'";
        if (count($thisfind) == 3) {
          $oxml .= " findisin{$ks}='" . htmlspecialchars($thisfind[2], ENT_QUOTES, 'UTF-8') . "'";
        }
      }
    }

    /* if find() returned an error message rather than an array of results
       and exactly two terms were given, try again with the terms
       (a, b) rearranged as (a, a, b): that is, look for a as a place in its
       own right, using b as the is_in qualifier rather than as a place */
    if (is_string($nameds) && ! $isapostcode) {
      if (count($thisfind) == 2) {
        $thisfind = array_merge(array($thisfind[0]), $thisfind);
        $nameds = search::find($thisfind, $maxresults, FALSE, $anyoccurenceifnotlocal);
      }
    }

    /* if still nothing, report it */
    if (is_string($nameds)) {
      $oxml .= " error='place not found'>\n</searchresults>\n";
      return $oxml;
    }

    if (count($nameds) == 0) {
      if ($isapostcode) {
        $oxml .= " error='name not found for postcode'>\n</searchresults>\n";
      } else {
        $oxml .= " error='name not found'>\n</searchresults>\n";
      }
      return $oxml;
    }

    /* foundnearplace indicates whether the nearest place was the qualifying one or
       whether there was another place closer */
    $foundnearplace = ! empty($nameds[0]->place);
    $oxml .= " foundnearplace{$ks}='" . ($foundnearplace ? 'yes' : 'no') . "'";

    /* reflect any postcode requested in the xml. This is user input, so it
       must be escaped like every other reflected attribute value */
    if ($isapostcode) {
      $oxml .= " postcode='" . htmlspecialchars($originalpostcode, ENT_QUOTES, 'UTF-8') . "'";
    }

    /* keep a note of the result for debugging */
    $db->log("result: ".print_r($nameds,1));
    $multinameds[] = $nameds;
  }

  $oxml .= ">\n";
  $xml = '';

  if (count($multinameds) == 1) {
    /* the usual case */
    foreach ($multinameds[0] as $named) { $xml .= $named->xmlise(); }
  } else {
    /* the colon case: so now compute the great circle distances for each combination
       of the first 3 results on each side of the colon  */
    include_once('greatcircle.php');
    $gcs = array();
    $limit0 = min(count($multinameds[0]), 3);
    $limit1 = min(count($multinameds[1]), 3);

    /* first pass: pair up leading places only; each side stops at its first
       result that is not a place */
    for ($i0 = 0; $i0 < $limit0; $i0++) {
      $output0 = FALSE;
      if ($multinameds[0][$i0]->category != 'place') { break; }
      for ($i1 = 0; $i1 < $limit1; $i1++) {
        if ($multinameds[1][$i1]->category != 'place') { break; }
        if (! $output0) {
          /* emit the left-hand result once, ahead of its partners */
          $xml .= $multinameds[0][$i0]->xmlise();
          $output0 = TRUE;
        }
        $xml .= $multinameds[1][$i1]->xmlise();
        $gcs[] = new greatcircle($multinameds[0][$i0], $multinameds[1][$i1]);
      }
    }

    /* second pass: pair up the non-places, skipping over any places */
    for ($i0 = 0; $i0 < $limit0; $i0++) {
      $output0 = FALSE;
      if ($multinameds[0][$i0]->category == 'place') { continue; }
      for ($i1 = 0; $i1 < $limit1; $i1++) {
        if ($multinameds[1][$i1]->category == 'place') { continue; }
        if (! $output0) {
          $xml .= $multinameds[0][$i0]->xmlise();
          $output0 = TRUE;
        }
        $xml .= $multinameds[1][$i1]->xmlise();
        $gcs[] = new greatcircle($multinameds[0][$i0], $multinameds[1][$i1]);
      }
    }

    /* the distances themselves follow all of the results */
    foreach ($gcs as $gc) { $xml .= $gc->xmlise(); }
  }

  /* and that's it... */
  return $oxml . $xml . "</searchresults>\n";
}
171
172  // --------------------------------------------------
/* static */ function find(&$terms, $maxresults, $doingpostcode, $anyoccurenceifnotlocal=FALSE) {
  /* Given the terms of a search, returns an array of named's which are the
     matches for that search. Usually this will be called from xmlise rather
     than directly.

     terms: an array of strings, passed by reference and possibly rewritten
       (lat/lon qualifiers are removed, a pure lat/lon query empties it, and
       'cities' is replaced by 'city'). The forms accepted are:

       (1) terms[0]: name of something to look for (including references like road numbers
       and IATA codes, non-native versions of a name (e.g. Londres), or generic things
       like "school" or "hotels" (singular or plural)

       or

       (2) terms[0] as above, and

       terms[1]: qualifying place name so that terms[0]
       must be found close to (an instance of - there may be more
       than one match) this place.

       or

       (3) terms[0] and terms[1] as above, and

       terms[2]: if given, a further qualifying string, which must appear in
       the is_in of the qualifying place. For example, there are
       multiple Cambridges, so by setting this to UK, the Cambridge
       with UK in its is_in will be used. (Actually there are two
       Cambridges in the UK, so Cambridgeshire might be more
       appropriate in this case)

       or

       (4) terms[0] and terms[1] are a lat and lon respectively - just asking 'where am I?'

       or

       (5) terms[0] is a name etc. as case 1, and terms[1] and terms[2] are lat and lon
       respectively restricting the search to near to that location

       Note that postcode searches are converted to name searches
       before being presented to the find function. find only deals
       with names and lat/lon pairs

     place names may also be postcode prefixes in the UK (e.g. CB21)
     or place and postcode (London EC1A), in which case we'll
     crosscheck.

     maxresults: the maximum number of results to return.

     doingpostcode: when set, the wider fallback search (with no qualifying
       place) is never attempted

     anyoccurenceifnotlocal: when set, and nothing was found near the
       qualifying place(s), fall back to a search for the name anywhere

     returns: an array of named objects - see class named for details
       or a string which is an error message
  */

  global $db, $config;

  include_once('canonical.php');
  include_once('named.php');
  include_once('region.php');

  /* toofars controls what "nearby" means for a place. For example a
     hamlet is only "near" somewhere if it is within 8km. Index 0 is the
     fallback for any rank not listed */
  $toofars = array(0=>10.0,
                   named::placerank('hamlet')=>8.0,
                   named::placerank('village')=>20.0,
                   named::placerank('suburb')=>20.0,
                   named::placerank('airport')=>20.0,
                   named::placerank('town')=>25.0,
                   named::placerank('city')=>45.0);

  $places = array(); /* the places, if any, which should qualify the search */
  $nameds = array(); /* the result */

  $nterms = count($terms);

  if ($nterms > 2 && search::islatlon($terms[1], $terms[2], $pseudoplace)) {
    /* case 5 above: reduce the lat/lon to a named, one of the places to qualify
       the search, and remove them from the list */
    array_splice($terms, 1, 2);
    $nterms -= 2;
    $places[] = clone $pseudoplace;
    $doinglatlonqualifier = TRUE;
  } else if ($nterms > 1) {
    if (search::islatlon($terms[0], $terms[1], $pseudoplace)) {
      /* case 4 above, simply a 'where am i' type of query on
         lat,lon, so the result is the artificial named for that
         lat/lon - but of course we need to get its context later,
         which is the whole point */
      $terms = array();
      $pseudoplace->findnearestplace();
      $pseudoplace->assigndescription($nterms > 1);
      $nameds[] = clone $pseudoplace;
    } else {
      /* does the qualifying place end in something shaped like a UK
         postcode prefix (e.g. CB21, EC1A)? The leading space is prepended
         so the pattern also matches when the prefix is the whole term */
      if (preg_match('/ ([A-Z]{1,2}[0-9]{1,2}[A-Z]?)\s*$/i', " {$terms[1]}", $matches)) {
        $prefix = $matches[1];
        include_once('postcodeprefix.php');
        $postcodeprefix = postcodeprefix::lookup($prefix);
        if (! empty($postcodeprefix)) {
          /* known prefix: qualify the search by an artificial place at the
             position recorded for that prefix */
          search::islatlon($postcodeprefix->lat, $postcodeprefix->lon, $pseudoplace);
          $pseudoplace->info = "middle of UK postcode area";
          $pseudoplace->name = "{$postcodeprefix->prefix} ({$postcodeprefix->placename})";
          $pseudoplace->rank = 5;
          $pseudoplace->findnearestplace();
          $pseudoplace->assigndescription(FALSE);
          $places[] = clone $pseudoplace;
        }
      }

      if (empty($postcodeprefix)) {
        /* case 2 or 3 above: search is qualified. Find any places of the name given as
           the second term  */

        $places = array_merge($places, named::lookupplaces($terms[1], NULL, TRUE));
        if (count($places) == 0) { return "I can't find {$terms[1]}"; }

        // $db->log ("found places " . print_r($places, 1));

        /* cull the possible places according to given qualifying is_in in case 3.
           A place survives an is_in term if any word of the term occurs
           within any word of the place's own is_in */
        $placeisin = $nterms > 2 ? array_slice($terms, 2) : array();
        foreach ($placeisin as $isinterm) {
          $wantedwords = explode(' ', canonical::canonicalise_to_string($isinterm));
          for ($i = 0; $i < count($places); $i++) {
            $sourcewords =
              explode(' ', canonical::canonicalise_to_string($places[$i]->is_in));
            $found = FALSE;
            foreach ($wantedwords as $wantedword) {
              foreach ($sourcewords as $sourceword) {
                if (strpos($sourceword, $wantedword) !== FALSE) {
                  $found = TRUE;
                  break 2;
                }
              }
            }
            if (! $found) {
              /* remove the place and revisit this index, which now holds
                 the next candidate */
              array_splice($places, $i, 1);
              $i--;
            }
          }
        }
        // $db->log ("places after cull " . print_r($places, 1));

        if (count($places) == 0) {
          /* nothing left, so say so, listing every is_in term that was given */
          $isin = implode(', ', array_slice($terms, 2));
          $unfoundplace = "{$terms[1]} not found";
          if (! empty($isin)) { $unfoundplace .= " in {$isin}"; }
        }
      }
    }
  }

  /* so, we've got so far a list of places, possibly empty, near
     which we must search (which may have come from a lat/lon or a
     name), and maybe a result already from a simple lat/lon for
     which we only require context */

  /* $terms is empty after a case 4 query, so read the name term
     defensively; NULL is what the undefined element evaluated to anyway */
  $nameterm = isset($terms[0]) ? $terms[0] : NULL;

  /* special cases for place like things: limit search
     only to places (rank > 0) rather than including streets named
     'Somewhere Place' etc */

  switch ($nameterm) {
  case 'cities':
    $terms[0] = 'city';
    $nameterm = 'city';
    $placesonly = y_op::eq(y_op::field('rank',0), named::placerank('city'));
    break;
  case 'towns':
    $placesonly = y_op::eq(y_op::field('rank',0), named::placerank('town'));
    break;
  case 'suburbs':
    $placesonly = y_op::eq(y_op::field('rank',0), named::placerank('suburb'));
    break;
  case 'villages':
    $placesonly = y_op::eq(y_op::field('rank',0), named::placerank('village'));
    break;
  case 'hamlets':
    $placesonly = y_op::eq(y_op::field('rank',0), named::placerank('hamlet'));
    break;
  case 'places':
    $placesonly = y_op::gt(y_op::field('rank',0), 0);
    break;
  }

  /* Work out canonical forms of the first search term (the road name or whatever) to
     try matching against equivalents in the database. There's more than one because
     Hinton Road becomes Hinton Rd as well, and so on */

  $canonterms = canonical::canonical_basic($nameterm);
  if (count($canonterms) > 4) { array_splice($canonterms, 4); }

  if (count($canonterms) > 1 && count($canonterms[0]) == 1 &&
      preg_match('/^[1-9][0-9]*$/', $canonterms[0][0]))
  {
    /* remove numbers at the beginning on the basis someone probably
     typed a street address, such as "31 Hinton Road" */
    array_splice($canonterms,0,1);
  }

  if (count($places) > 0) {
    /* There are qualifying places.

       SELECT * FROM named WHERE (region=n0 [or region=n1 or ...])
       ORDER BY ((lat - latplace)^2 + (lon - lonplace)^2 asc  */

    foreach ($places as $place) {
      $place->assigncontext(); // nearest more important place(s)

      /* find occurrences of the name ordered by distance from the place,
         for each of the places we found */
      $region = new region($place->lat, $place->lon);
      $regionnumbers = $region->considerregions();

      $q = $db->query();
      if (! isset($placesonly)) {
        /* NOTE(review): whereword() presumably fills $joiners by reference,
           since it is not otherwise set on this path - confirm in word.php */
        $q->where(word::whereword($joiners, $canonterms, FALSE, $regionnumbers));
      } else {
        $joiners = array(new placeindex(), new named());
        $ors = array();
        foreach ($regionnumbers as $regionnumber) {
          $ors[] = y_op::eq(y_op::field('region',0), $regionnumber);
        }
        $ands = array($placesonly,
                      count($ors) == 1 ? $ors[0] : y_op::oor($ors),
                      y_op::feq(y_op::field('id',0),y_op::field('id',1)));
        $q->where(y_op::aand($ands));
      }
      $q->ascending(canonical::distancerestriction($place->lat, $place->lon, count($joiners)-1));
      $q->limit($maxresults);
      // $q->groupby(y_op::field('id',count($joiners)-1));

      $toofar = empty($toofars[$place->rank]) ? $toofars[0]: $toofars[$place->rank];
      while ($q->select($joiners) > 0) {
        /* the last joiner is the named row itself */
        $named = $joiners[count($joiners) - 1];
        $named->place = clone $place;
        $named->place->localdistancefrom($named);

        if ($named->place->distance > $toofar) { break; } // everywhere else is further too

        unset($named->placenearer);
        $named->findnearestplace(/* other than... */ $place);
        if (! empty($named->placenearer) &&
            $named->place->distance < $named->placenearer->distance)
        {
          /* the qualifying place is the closer one, so drop the alternative */
          unset($named->placenearer);
        }
        $named->assigndescription($nterms > 1);
        $nameds[] = clone $named;
      }

      // $db->log ("found names near those places " . print_r($nameds, 1));
    }
  }

  if (count($nameds) == 0 && (count($places) == 0 || $anyoccurenceifnotlocal) &&
      empty($doinglatlonqualifier) && empty($doingpostcode))
  {
    /* Either no qualifying place, or no name found near given place
       (and we asked to search more widely). If there was a
       qualifying place try general search for name anyway: "but I
       did find one near..."

       In this case we have no place to order by distance from, so
       instead do a partial ordering so that exact matches on the
       name (or one of its abbreviated variants) come first and then
       partial matches. For example, "Fulbourn" would come before
       "Fulbourn Post Office" when searching for "Fulbourn". We do
       this by going round the loop twice, relaxing the exactness
       condition on the second time round

       ... Well, that's what I used to do. In the interests of
       efficiency, however, for now just do inexact matches. We'll
       still get places first, but a search for Bury will include
       Bury St Edmunds whereas before that would have been well down
       the list, after all the other Burys */
    $limit = $maxresults;
    $exact = FALSE; // TRUE;
    for ($i = 0; $i < 1 /* 2 */ && $limit > 0; $i++) {
      $q = $db->query();
      $q->where(word::whereword($joiners, $canonterms, $exact));
      $q->limit($limit);
      // $condition = y_op::aand($condition, y_op::le('rank',named::placerank('city')));
      /* prioritise places, and those in order of importance, cities first */
      $q->descending('rank');
      // $q->groupby(y_op::field('id',count($joiners)-1));

      while ($q->select($joiners) > 0) {
        /* pick out the row before logging it, so the log shows this row
           rather than an undefined or stale value */
        $named = $joiners[count($joiners) - 1];
        $db->log(print_r($joiners,1));
        $db->log(print_r($named,1));
        $namedclone = clone $named;
        if ($namedclone->rank > 0) {
          $namedclone->assigncontext();
          $namedclone->findnearestplace($namedclone, $namedclone->isolatedplaceneighbourranks());
        } else {
          $namedclone->findnearestplace();
        }
        /* when the qualifying place was culled away, carry the explanatory
           message in place of a place object */
        if (isset($unfoundplace)) { $namedclone->place = $unfoundplace; }
        $namedclone->assigndescription($nterms > 1);
        $nameds[] = $namedclone;
      }
      $limit -= count($nameds);
      $exact = FALSE;
    }

    // $db->log ("found names near other places " . print_r($nameds, 1));
  }

  /* cull duplicate responses. These are usually because it found the name near more than
     one place which matched the place name criterion. The first occurrence of
     each id wins */
  $namedsunique = array();
  foreach ($nameds as $named) {
    if (! array_key_exists($named->id, $namedsunique)) {
      $namedsunique[$named->id] = $named;
    }
  }

  return array_values($namedsunique);
}
496
497  // --------------------------------------------------
/* static */ function explodeterms($terms) {
  /* Helper which turns a complete search string into the array of terms
     expected by find(): the word "near" (with a space either side) is
     treated as a comma, the string is split on commas, and each piece has
     its surrounding whitespace removed */
  $commaseparated = str_replace(' near ', ',', $terms);
  return array_map('trim', explode(',', $commaseparated));
}
506
507  // --------------------------------------------------
/* private static */ function islatlon($term1, $term2, &$pseudoplace) {
  /* Decides whether term1 and term2 (two strings; they arrive separately
     because the comma between them split them apart) are both decimal
     numbers and so together form a latitude/longitude pair.

     If they are, $pseudoplace is set to a freshly made, anonymous,
     artificial named located at that lat/lon and TRUE is returned.
     Otherwise $pseudoplace is left alone and FALSE is returned.

     Each artificial named gets its own negative id, stepping down by 10
     from a counter that persists between calls */
  static $anonid = 0;

  $decimalpattern = '/^-?([0-9]+|[0-9]*\\.[0-9]+)$/';
  if (! preg_match($decimalpattern, $term1) || ! preg_match($decimalpattern, $term2)) {
    return FALSE;
  }

  $anonid -= 10;

  $located = new named();
  $located->category = 'place';
  $located->lat = (double)$term1;
  $located->lon = (double)$term2;
  $located->name = '';
  $located->id = $anonid;
  $located->rank = named::placerank('city'); // hmm
  $located->info = 'requested location';

  $pseudoplace = $located;
  return TRUE;
}
533
534}
535
536?>
Note: See TracBrowser for help on using the repository browser.