source: subversion/sites/namefinder/php/search.php @ 17445

Last change on this file since 17445 was 12251, checked in by david, 11 years ago

Use contextual search ("Huntingdon, England") even if there is a place with the same name as the context ("England" - which there is!). Also, when this happens, describe the context relative to neighbouring places.

File size: 24.0 KB
Line 
1<?php
2
3include_once('word.php');
4
5class search {
6
7  /* The common point of call for doing searches. Call search::xmlise(). */
8
9  // --------------------------------------------------
/**
 * The common entry point for searches: runs the search described by $find and
 * returns the outcome as one complete XML document.
 *
 *   find: the search string from the user,
 *     search    ::- nameword [nameword ...] ( ("near" | ",") placeword [placeword ...]
 *                   ("," isinword )? )?
 *     nameword  ::- ( word | lat "," lon )
 *     placeword ::- ( word | lat "," lon )
 *     find      ::- search ( ":" search )?
 *
 *     The colon form allows the great circle distance between two results to
 *     be calculated. Experimentally, the namewords can also form a UK postcode
 *     with no qualifications. Comments are allowed in the string between
 *     square brackets; each bracketed comment is removed before searching.
 *
 *   maxresults: the maximum number of results to return in the XML (forced to
 *     1 when the search string is a full postcode rather than a prefix)
 *
 *   anyoccurenceifnotlocal: handed on unchanged to find()
 *
 *   returns: a string, formatted as XML. Failures are reported through an
 *     error attribute on an otherwise empty searchresults element. If a
 *     closed.xml file exists in the install directory its contents are
 *     returned verbatim instead.
 */
public static function xmlise($find, $maxresults, $anyoccurenceifnotlocal) {
  /* $config has to be imported as well as $db: the closed-file path below is
     built from it */
  global $db, $config;

  include_once('postcodelookup.php');
  include_once('options.php');

  $closedfile = "{$config['installdir']}/closed.xml";
  if (file_exists($closedfile)) {
    return file_get_contents($closedfile);
  }

  /* oxml is the return string. Put admin stuff in the top level element.
     (The declaration is assembled piecewise so that the source never contains
     a literal PHP closing tag.) */
  $oxml = '<' . '?' . 'xml version="1.0" encoding="UTF-8"' . '?'. '>' . "\n";
  $oxml .= "<searchresults find='" . htmlspecialchars($find, ENT_QUOTES, 'UTF-8') . "'";
  $indexdate = options::getoption('indexdate');
  if ($indexdate == '') {
    $oxml .= " error='index temporarily unavailable'>\n</searchresults>\n";
    return $oxml;
  }
  $oxml .= " sourcedate='" . htmlspecialchars($indexdate, ENT_QUOTES, 'UTF-8') . "'";
  $oxml .= " date='" . date('Y-m-d H:i:s') . "'";

  /* make a note of the query */
  include_once('querylog.php');
  querylog::log($find);

  /* remove comments in square brackets from the input. Each bracket pair is
     matched on its own, so that text between two separate comments (such as
     the colon of a distance search) survives */
  $find = trim(preg_replace('/\\[[^\\]]*\\]/', '', $find));

  /* is there actually something to search on? */
  if ($find == '') {
    $oxml .= " error='nothing given to search for'>\n</searchresults>\n";
    return $oxml;
  }

  /* does it look like a postcode? */
  $postcodelookup = postcodelookup::postcodelookupfactory($find);
  if (! empty($postcodelookup)) {
    $find = $postcodelookup->namefinderquery;
    if (! $postcodelookup->prefixonly) { $maxresults = 1; }
  }

  $finds = explode(':', $find);
  if (count($finds) > 2) {
    $oxml .= " error='too many colons'>\n</searchresults>\n";
    return $oxml;
  }
  $finds = array_map('trim', $finds);
  $distancesearch = count($finds) > 1;

  /* no closing > yet: adding find data to header and possibly errors */
  $oxml .= " distancesearch='" . ($distancesearch ? 'yes' : 'no') . "'";

  $multinameds = array();

  for ($i = 0; $i < count($finds); $i++) {
    /* that is, for each component in a search separated by colons, but often only once... */
    $thisfind = self::explodeterms($finds[$i]);

    /* the heart of the search - see find(). Note that find() takes the terms
       by reference and may rewrite them, so what is reflected below is the
       term list as find() left it */
    $nameds = self::find($thisfind, $maxresults, $postcodelookup, $anyoccurenceifnotlocal);

    /* reflect the search data back in the xml; attribute names carry the
       1-based component number only for a colon (distance) search */
    $ks = $distancesearch ? $i+1 : '';
    if (count($thisfind) >= 1) {
      $oxml .= " findname{$ks}='" . htmlspecialchars($thisfind[0], ENT_QUOTES, 'UTF-8')."'";
      if (count($thisfind) >= 2) {
        $oxml .= " findplace{$ks}='" . htmlspecialchars($thisfind[1], ENT_QUOTES, 'UTF-8')."'";
        if (count($thisfind) == 3) {
          $oxml .= " findisin{$ks}='" . htmlspecialchars($thisfind[2], ENT_QUOTES, 'UTF-8')."'";
        }
      }
    }

    /* if find() returned an error message rather than an array of results and
       this was a plain "name, place" search (not a postcode), try once more
       with the terms (name, name, place): that is, look for a place called
       name, using the given place as its is_in qualifier */
    if (is_string($nameds) && empty($postcodelookup)) {
      if (count($thisfind) == 2) {
        $thisfind = array_merge(array($thisfind[0]), $thisfind);
        $nameds = self::find($thisfind, $maxresults, FALSE, $anyoccurenceifnotlocal, TRUE);
      }
    }

    /* if still nothing, report it */
    if (is_string($nameds)) {
      $oxml .= " error='place not found'>\n</searchresults>\n";
      return $oxml;
    }

    if (count($nameds) == 0) {
      if (! empty($postcodelookup)) {
        $oxml .= " error='name not found for postcode'>\n</searchresults>\n";
      } else {
        $oxml .= " error='name not found'>\n</searchresults>\n";
      }
      return $oxml;
    }

    /* foundnearplace indicates whether the nearest place was the qualifying one or
       whether there was another place closer */
    $foundnearplace = ! empty($nameds[0]->place);
    $oxml .= " foundnearplace{$ks}='" . ($foundnearplace ? 'yes' : 'no') . "'";

    /* reflect any postcode requested in the xml, escaped like every other
       attribute value */
    if (! empty($postcodelookup)) {
      $oxml .= " postcode='" .
        htmlspecialchars($postcodelookup->postcode, ENT_QUOTES, 'UTF-8') . "'";
    }

    /* keep a note of the result for debugging */
    $db->log("result: ".print_r($nameds,1));
    $multinameds[] = $nameds;
  }

  $oxml .= ">\n";
  $xml = '';

  if (count($multinameds) == 1) {
    /* the usual case */
    foreach ($multinameds[0] as $named) { $xml .= $named->xmlise(); }
  } else {
    /* the colon case: so now compute the great circle distances for each combination
       of the first 3 results in each side of the colon */
    include_once('greatcircle.php');
    $gcs = array();
    $limit0 = min(count($multinameds[0]), 3);
    $limit1 = min(count($multinameds[1]), 3);

    /* first pass: pair up leading results that are places, stopping on either
       side at the first result which is not a place */
    for ($i0 = 0; $i0 < $limit0; $i0++) {
      $output0 = FALSE;
      if ($multinameds[0][$i0]->category != 'place') { break; }
      for ($i1 = 0; $i1 < $limit1; $i1++) {
        if ($multinameds[1][$i1]->category != 'place') { break; }
        if (! $output0) {
          /* emit the left hand result once, ahead of its first partner */
          $xml .= $multinameds[0][$i0]->xmlise();
          $output0 = TRUE;
        }
        $xml .= $multinameds[1][$i1]->xmlise();
        $gcs[] = new greatcircle($multinameds[0][$i0], $multinameds[1][$i1]);
      }
    }

    /* second pass: pair up the results which are not places, skipping over
       any places on either side */
    for ($i0 = 0; $i0 < $limit0; $i0++) {
      $output0 = FALSE;
      if ($multinameds[0][$i0]->category == 'place') { continue; }
      for ($i1 = 0; $i1 < $limit1; $i1++) {
        if ($multinameds[1][$i1]->category == 'place') { continue; }
        if (! $output0) {
          $xml .= $multinameds[0][$i0]->xmlise();
          $output0 = TRUE;
        }
        $xml .= $multinameds[1][$i1]->xmlise();
        $gcs[] = new greatcircle($multinameds[0][$i0], $multinameds[1][$i1]);
      }
    }

    /* the distances follow all of the results */
    foreach ($gcs as $gc) { $xml .= $gc->xmlise(); }
  }

  /* and that's it... */
  return $oxml . $xml . "</searchresults>\n";
}
183
184  // --------------------------------------------------
/**
 * Given an already split search, returns an array of nameds which are the
 * matches for it. Usually this will be called from xmlise rather than directly.
 *
 *   terms: an array of strings, passed by reference and possibly rewritten
 *     here (lat/lon terms are removed; 'cities' becomes 'city'):
 *
 *     (1) terms[0]: name of something to look for (including references like
 *     road numbers and IATA codes, non-native versions of a name (e.g.
 *     Londres), or generic things like "school" or "hotels" (singular or
 *     plural)
 *
 *     or (2) terms[0] as above, and terms[1]: qualifying place name so that
 *     terms[0] must be found close to (an instance of - there may be more than
 *     one match) this place.
 *
 *     or (3) terms[0] and terms[1] as above, and terms[2]: if given, a further
 *     qualifying string, which must appear in the is_in of the qualifying
 *     place. For example, there are multiple Cambridges, so by setting this to
 *     UK, the Cambridge with UK in its is_in will be used. (Actually there are
 *     two Cambridges in the UK, so Cambridgeshire might be more appropriate in
 *     this case)
 *
 *     or (4) terms[0] and terms[1] are a lat and lon respectively - just
 *     asking 'where am I?'. The only result is the artificial named for that
 *     location, and terms is emptied.
 *
 *     or (5) terms[0] is a name etc. as case 1, and terms[1] and terms[2] are
 *     lat and lon respectively restricting the search to near that location
 *
 *     Note that postcode searches are converted to name searches before being
 *     presented to the find function. find only deals with names and lat/lon
 *     pairs.
 *
 *     Place names may also be postcode prefixes in the UK (e.g. CB21) or place
 *     and postcode (London EC1A), in which case we'll crosscheck.
 *
 *   maxresults: the maximum number of results to fetch per query
 *
 *   postcodelookup: not examined here; only handed on to the recursive call
 *
 *   anyoccurenceifnotlocal: if true, and nothing was found near the qualifying
 *     place(s), fall back to an unqualified search for the name
 *
 *   recursive: true when find has called itself (or xmlise is retrying) with
 *     the name duplicated as the place; suppresses further recursion
 *
 *   returns: an array of named objects - see class named for details -
 *     or a string which is an error message
 */
public static function find(&$terms, $maxresults, $postcodelookup,
                            $anyoccurenceifnotlocal=FALSE, $recursive=FALSE)
{
  global $db, $config;

  include_once('canonical.php');
  include_once('named.php');
  include_once('region.php');
  /* postcodelookup is used below; make sure it is loaded even when find is
     not reached through xmlise */
  include_once('postcodelookup.php');

  /* toofars controls what "nearby" means for a place. For example a
     hamlet is only "near" somewhere if it is within 8km */
  $toofars = array(0=>10.0,
                   named::placerank('hamlet')=>8.0,
                   named::placerank('village')=>20.0,
                   named::placerank('suburb')=>20.0,
                   named::placerank('airport')=>20.0,
                   named::placerank('town')=>25.0,
                   named::placerank('city')=>45.0);

  $places = array(); /* the places, if any, which should qualify the search */
  $nameds = array(); /* the result */

  $nterms = count($terms);

  if ($nterms > 2 && self::islatlon($terms[1], $terms[2], $pseudoplace)) {
    /* case 5 above: reduce the lat/lon to a named, one of the places to qualify
       the search, and remove them from the list */
    array_splice($terms, 1, 2);
    $nterms -= 2;
    $places[] = clone $pseudoplace;
    $doinglatlonqualifier = TRUE;
  } else if ($nterms > 1) {
    if (self::islatlon($terms[0], $terms[1], $pseudoplace)) {
      /* case 4 above, simply a 'where am i' type of query on lat,lon, so the
         result is the artificial named for that lat/lon, together with its
         context, which is the whole point. Return straight away: there is no
         name left to search on, and carrying on would index the now empty
         terms array and recurse into a name search with no name */
      $terms = array();
      $pseudoplace->findnearestplace();
      $pseudoplace->assigndescription($nterms > 1);
      return array(clone $pseudoplace);
    }

    /* case 2 or 3 above: search is qualified. Find any places of the name given as
       the second term */

    /* allow for the place being qualified by a postcode either with or
       without a comma separator (and also below, an arbitrary is_in term) */
    $postcodeappendage = postcodelookup::postcodelookupfactory($terms[1], TRUE);
    if (! empty($postcodeappendage)) {
      if (empty($postcodeappendage->textbefore)) {
        /* the place term is only a postcode - use a pseudo place instead */
        $places = array(named::pseudonamedfrompostcode($postcodeappendage));
        unset($postcodeappendage);
      } else {
        /* otherwise it stands as an is_in term even though there's no comma */
        $places = array_merge($places, named::lookupplaces($postcodeappendage->textbefore,
                                                           NULL, TRUE));
      }
    } else {
      if (! empty($terms[2])) {
        $postcodeappendage = postcodelookup::postcodelookupfactory($terms[2]);
        /* which, if set, is like an is_in qualifier */
      }
      $places = array_merge($places, named::lookupplaces($terms[1], NULL, TRUE));
    }

    if (count($places) == 0) { return "I can't find {$terms[1]}"; }

    // $db->log ("found places " . print_r($places, 1));

    /* cull the possible places according to given qualifying is_in in case 3, or
       by a postcode or postcode area */
    if (empty($postcodeappendage)) {
      $placeisin = $nterms > 2 ? array_slice($terms, 2) : array();
      if (! empty($placeisin)) {
        foreach ($placeisin as $isin) {
          $isinstrings = explode(' ', canonical::canonicalise_to_string($isin));
          for ($i = 0; $i < count($places) /* which varies! */; $i++) {
            $sourceisinstrings =
              explode(' ', canonical::canonicalise_to_string($places[$i]->is_in));
            /* a place survives if any word of the qualifier occurs within
               any word of the place's own is_in */
            $found = FALSE;
            foreach ($isinstrings as $isinstring) {
              foreach ($sourceisinstrings as $sourceisin) {
                if (strpos($sourceisin, $isinstring) !== FALSE) {
                  $found = TRUE;
                  break 2;
                }
              }
            }
            if (! $found) {
              array_splice($places, $i, 1);
              $i--; // because we'll increase it again in the for
            }
          }
        }
        // $db->log ("places after cull " . print_r($places, 1));
      }
    } else {
      /* cull the places to be within a reasonable distance of the postcode prefix centroid */
      include_once('placeindex.php');
      $postcodeplace = named::pseudonamedfrompostcode($postcodeappendage);
      $region = new region($postcodeplace->lat, $postcodeplace->lon);
      $considerregions = $region->considerregions();
      for ($i = 0; $i < count($places) /* which varies! */; $i++) {
        if (in_array($places[$i]->region, $considerregions)) {
          /* the biggest postcode area (in Caithness) is approx 120km in diameter,
             so we need to be within 60km or so of the place for it to qualify */
          $tempnamed = clone $places[$i];
          if ($tempnamed->localdistancefrom($postcodeplace) < 60.0) { continue; }
        }
        array_splice($places, $i, 1);
        $i--; // because we'll increase it again in the for
      }
    }

    if (count($places) == 0) {
      /* nothing left, so say so, listing every is_in term that was given.
         (Nothing currently reads $unfoundplace: its one use, in the general
         search below, is commented out.) */
      $isin = implode(', ', array_slice($terms, 2));
      $unfoundplace = "{$terms[1]} not found";
      if (! empty($isin)) { $unfoundplace .= " in {$isin}"; }
    }
  }

  /* so, we've got so far a list of places, possibly empty, near
     which we must search (which may have come from a lat/lon or a
     name) */

  /* special cases for place like things: limit search
     only to places (rank > 0) rather than including streets named
     'Somewhere Place' etc */

  switch ($terms[0]) {
  case 'cities':
    $terms[0] = 'city';
    $placesonly = y_op::eq(y_op::field('rank',0), named::placerank('city'));
    break;
  case 'towns':
    $placesonly = y_op::eq(y_op::field('rank',0), named::placerank('town'));
    break;
  case 'suburbs':
    $placesonly = y_op::eq(y_op::field('rank',0), named::placerank('suburb'));
    break;
  case 'villages':
    $placesonly = y_op::eq(y_op::field('rank',0), named::placerank('village'));
    break;
  case 'hamlets':
    $placesonly = y_op::eq(y_op::field('rank',0), named::placerank('hamlet'));
    break;
  case 'places':
    $placesonly = y_op::gt(y_op::field('rank',0), 0);
    break;
  }

  /* Work out canonical forms of the first search term (the road name or whatever) to
     try matching against equivalents in the database. There's more than one because
     Hinton Road becomes Hinton Rd as well, and so on */

  $canonterms = canonical::canonical_basic($terms[0]);
  if (count($canonterms) > 4) { array_splice($canonterms, 4); }

  $ctn = count($canonterms)-1;
  if ($ctn > 0) {
    if (count($canonterms[0]) == 1 && preg_match('/^[1-9][0-9]*$/', $canonterms[0][0])) {
      /* remove numbers at the beginning on the basis someone probably
         typed a street address, such as "31 Hinton Road" */
      array_splice($canonterms,0,1);
    } else if (count($canonterms[$ctn]) == 1 &&
               preg_match('/^[1-9][0-9]*$/', $canonterms[$ctn][0]))
    {
      /* ditto European style addresses with the number at the end, as in "Via Meloria 14" */
      array_splice($canonterms, $ctn, 1);
    }
  }

  if (count($places) > 0) {
    /* There are qualifying places.

       SELECT * FROM named WHERE (region=n0 [or region=n1 or ...])
       ORDER BY ((lat - latplace)^2 + (lon - lonplace)^2 asc  */

    foreach ($places as $place) {
      $place->assigncontext(); // nearest more important place(s)
      /* find occurences of the name ordered by distance from the place,
         for each of the places we found */
      $region = new region($place->lat, $place->lon);
      $regionnumbers = $region->considerregions();

      $q = $db->query();
      if (! isset($placesonly)) {
        $q->where(word::whereword($joiners, $canonterms, FALSE, $regionnumbers));
      } else {
        /* a generic "towns near X" style search: match on rank and region
           only, not on the words of the name */
        include_once('placeindex.php');
        $joiners = array(new placeindex(), new named());
        $ors = array();
        foreach ($regionnumbers as $regionnumber) {
          $ors[] = y_op::eq(y_op::field('region',0), $regionnumber);
        }
        $ands = array($placesonly,
                      count($ors) == 1 ? $ors[0] : y_op::oor($ors),
                      y_op::feq(y_op::field('id',0),y_op::field('id',1)));
        $q->where(y_op::aand($ands));
      }
      $q->ascending(canonical::distancerestriction($place->lat, $place->lon, count($joiners)-1));
      $q->limit($maxresults);
      // $q->groupby(y_op::field('id',count($joiners)-1));

      $toofar = empty($toofars[$place->rank]) ? $toofars[0]: $toofars[$place->rank];
      while ($q->select($joiners) > 0) {
        $named = $joiners[count($joiners) - 1];
        $named->place = clone $place;
        $named->place->localdistancefrom($named);

        if ($named->place->distance > $toofar) { break; } // everywhere else is further too

        unset($named->placenearer);
        $named->findnearestplace(/* other than... */ $place);
        if (! empty($named->placenearer) &&
            $named->place->distance < $named->placenearer->distance)
        {
          /* the qualifying place is the closer of the two after all */
          unset($named->placenearer);
        }
        $named->assigndescription($nterms > 1);
        $nameds[] = clone $named;
      }

      // $db->log ("found names near those places " . print_r($nameds, 1));
    }
  }

  if (! $recursive && $nterms > 1) {
    /* look for place qualified by is_in (do it this way because
       there are places e.g called "England" where you may or may
       not find the first term). Any such results go ahead of the ones
       found above */
    $qualified_terms = array_merge(array($terms[0]), $terms);
    $qualified_nameds = self::find($qualified_terms, $maxresults,
                                   $postcodelookup, $anyoccurenceifnotlocal, TRUE);
    if (! is_string($qualified_nameds)) {
      $nameds = array_merge($qualified_nameds, $nameds);
    }
  }

  if (count($nameds) == 0 && (count($places) == 0 || $anyoccurenceifnotlocal) &&
      empty($doinglatlonqualifier))
  {
    /* Either no qualifying place, or no name found near given place
       (and we asked to search more widely). If there was a
       qualifying place try general search for name anyway: "but I
       did find one near..."

       In this case we have no place to order by distance from, so
       instead do a partial ordering so that exact matches on the
       name (or one of its abbreviated variants) come first and then
       partial matches. For example, "Fulbourn" would come before
       "Fulbourn Post Office" when searching for "Fulbourn". We do
       this by going round the loop twice, relaxing the exactness
       condition on the second time round

       ... Well, that's what I used to do. In the interests of
       efficiency, however, for now just do inexact matches. We'll
       still get places first, but a search for Bury will include
       Bury St Edmunds whereas before that would have been well down
       the list, after all the other Burys */
    $limit = $maxresults;
    $exact = FALSE; // TRUE;
    for ($i = 0; $i < 1 /* 2 */ && $limit > 0; $i++) {
      $q = $db->query();
      $q->where(word::whereword($joiners, $canonterms, $exact));
      $q->limit($limit);
      // $condition = y_op::aand($condition, y_op::le('rank',named::placerank('city')));
      /* prioritise places, and those in order of importance, cities first */
      $q->descending('rank');
      // $q->groupby(y_op::field('id',count($joiners)-1));

      while ($q->select($joiners) > 0) {
        // $db->log(print_r($joiners,1));
        // $db->log(print_r($named,1));
        $named = $joiners[count($joiners) - 1];
        $namedclone = clone $named;
        if ($namedclone->rank > 0) {
          /* the result is itself a place: describe it relative to its
             neighbours rather than to itself */
          $namedclone->assigncontext();
          $namedclone->findnearestplace($namedclone, $namedclone->isolatedplaceneighbourranks());
        } else {
          $namedclone->findnearestplace();
        }
        // if (isset($unfoundplace)) { $namedclone->place = $unfoundplace; }
        $namedclone->assigndescription($nterms > 1);
        $nameds[] = $namedclone;
      }
      $limit -= count($nameds);
      $exact = FALSE;
    }

    // $db->log ("found names near other places " . print_r($nameds, 1));
  }

  /* cull duplicate responses, keeping the first occurrence of each id. These are
     usually because it found the name near more than one place which matched
     the place name criterion */
  $namedsunique = array();
  foreach ($nameds as $named) {
    if (! array_key_exists($named->id, $namedsunique)) {
      $namedsunique[$named->id] = $named;
    }
  }

  return array_values($namedsunique);
}
551
552  // --------------------------------------------------
/**
 * Helper function to expand a complete search string into an array of terms
 * suitable for the find function.
 *
 * The word "near" (lower case, with a space on each side) is treated as
 * another spelling of the comma separator; the string is then split on commas
 * and each term is trimmed of surrounding whitespace.
 *
 *   terms: one search string (one side of any colon)
 *
 *   returns: an array of trimmed strings, always with at least one element
 *     (a single empty string for empty input)
 *
 * Declared static because it is only ever invoked as search::explodeterms().
 */
public static function explodeterms($terms) {
  $terms = str_replace(' near ', ',', $terms);
  return array_map('trim', explode(',', $terms));
}
561
562  // --------------------------------------------------
/**
 * Returns a boolean according to whether term1 and term2 (both strings,
 * separate because the comma between them caused them to be split apart) are
 * both decimal numbers which together form a plausible latitude/longitude
 * pair: latitude within -90..90 and longitude within -180..180.
 *
 * If so, constructs and returns in $pseudoplace an anonymous, artificial named
 * which is located at the lat/lon determined. Each one gets a fresh negative
 * id (-10, -20, ...) from a counter kept for the life of the request. When the
 * terms are not a lat/lon pair, $pseudoplace is left untouched.
 *
 * Intended for use within this class only, but left publicly callable in
 * case anything outside this file relies on it.
 */
public static function islatlon($term1, $term2, &$pseudoplace) {
  static $anonid = 0;

  /* an optionally negative integer, or a decimal with digits after the point */
  $decimal = '/^-?([0-9]+|[0-9]*\\.[0-9]+)$/';
  if (! preg_match($decimal, $term1) || ! preg_match($decimal, $term2)) {
    return FALSE;
  }

  $lat = (double)$term1;
  $lon = (double)$term2;
  /* two numbers which cannot be a position on the globe are not a lat/lon:
     leave them to be treated as ordinary search terms */
  if (abs($lat) > 90.0 || abs($lon) > 180.0) {
    return FALSE;
  }

  $anonid -= 10;

  $pseudoplace = new named();
  $pseudoplace->category = 'place';
  $pseudoplace->lat = $lat;
  $pseudoplace->lon = $lon;
  $pseudoplace->name = '';
  $pseudoplace->id = $anonid;
  $pseudoplace->rank = named::placerank('city'); // hmm
  $pseudoplace->info = 'requested location';
  return TRUE;
}
588
589}
590
591?>
Note: See TracBrowser for help on using the repository browser.