source: subversion/sites/namefinder/php/postcodelookup.php @ 17445

Last change on this file since 17445 was 12160, checked in by david, 11 years ago

fix a subtle little bug whereby I was appending an object to the search string rather than a field of that object. How did it ever work?

File size: 6.9 KB
Line 
1<?php
2
class postcodelookup {

  /* This helper class consults google to see if it can find an
     address for a postcode.  The address is then fed back into the
     name search.  Make an object of this class using
     postcodelookup::postcodelookupfactory: for a full postcode the
     factory performs the lookup itself (via get_query), and the
     resulting name finder query is left in $namefinderquery on the
     returned object. */

  /* http://www.google.com/search?hl=en&q=%22PE27+5JP%22 */

  /* normalised (upper case) postcode, or just its outward part when $prefixonly is TRUE */
  var $postcode;
  /* whatever postcodeprefix::lookup() returned for the outward part; its ->prefix
     field is what gets appended to the name finder query */
  var $postcodeprefix;
  /* text found in front of the postcode (empty unless the factory allowed text before) */
  var $textbefore;
  /* TRUE when only the outward part of a postcode (e.g. 'PE27') was supplied */
  var $prefixonly;
  /* base URL of the search; the quoted, url-encoded postcode is appended to it */
  var $googlequery = 'http://www.google.com/search?hl=en&q=';
  /* the query to hand on to the name finder; stays NULL until a lookup succeeds */
  var $namefinderquery;

  // --------------------------------------------------
  /**
   * Build a postcodelookup from a string that may be (or may end with) a UK postcode.
   *
   * Declared static because it is only ever meant to be called as
   * postcodelookup::postcodelookupfactory(); calling a non-static method
   * statically is an Error from PHP 8.0.
   *
   * @param string $prospectivepostcode candidate text, e.g. 'PE27 5JP' or 'PE27'
   * @param bool   $allowtextbefore     when TRUE, arbitrary text may precede the postcode
   *                                    and is kept in $textbefore
   * @return postcodelookup|false the populated object, or FALSE when the text is not a
   *                              postcode, the prefix is unknown, or no address was found
   */
  static function postcodelookupfactory($prospectivepostcode, $allowtextbefore=FALSE) {
    /* group 1 is always the text before the postcode (an empty group when not allowed),
       so the postcode parts are groups 2 and 3 in both modes */
    $prepattern = $allowtextbefore ? '^(.*) +' : '^ +()';
    /* the leading space lets both prepatterns insist on ' +' in front of the postcode */
    $prospectivepostcode = " {$prospectivepostcode}";

    if (preg_match ("/{$prepattern}([a-z]{1,2}[0-9]{1,2}[a-z]?) +([0-9][a-z]{2})\$/i",
                    $prospectivepostcode, $matches))
    {
      $prefix = strtoupper($matches[2]);
      $postcode = strtoupper("{$matches[2]} {$matches[3]}");
      $prefixonly = FALSE;
    } else if (preg_match("/{$prepattern}([A-Z]{1,2}[0-9]{1,2}[A-Z]?)\s*\$/i",
                          $prospectivepostcode, $matches))
    {
      /* it is just the first part of a UK postcode; normalise its case exactly as
         the full-postcode branch does so the prefix lookup sees consistent input */
      $prefix = strtoupper($matches[2]);
      $postcode = $prefix;
      $prefixonly = TRUE;
    } else {
      return FALSE; /* not a postcode */
    }

    $postcodelookup = new postcodelookup();
    $postcodelookup->postcode = $postcode;
    $postcodelookup->prefixonly = $prefixonly;
    $postcodelookup->textbefore = $matches[1];

    include_once('postcodeprefix.php');
    $postcodelookup->postcodeprefix = postcodeprefix::lookup($prefix);
    if (empty($postcodelookup->postcodeprefix)) { return FALSE; }

    if ($prefixonly) {
      /* nothing to search for: the prefix's own field is the whole query */
      $postcodelookup->namefinderquery = $postcodelookup->postcodeprefix->prefix;
    } else if (! $postcodelookup->get_query()) {
      return FALSE;
    }
    return $postcodelookup;
  }

  // --------------------------------------------------
  /**
   * Run the web lookup for $this->postcode and report whether it produced a query.
   *
   * @return bool TRUE when $this->namefinderquery has been set, FALSE when there is
   *              no postcode or nothing usable was found
   */
  function get_query() {
    if (empty($this->postcode)) { return FALSE; }
    $this->googleme();
    return ! is_null($this->namefinderquery);
  }

  // --------------------------------------------------
  /**
   * Scan a page of HTML for an address that ends in $this->postcode and, if one of its
   * clauses is accepted by named::lookupplaces, store a query in $this->namefinderquery.
   *
   * Uses the global $db object for logging.
   *
   * @param string $subject HTML text to search
   * @return bool TRUE when a query was stored, FALSE otherwise
   */
  /* private */ function match($subject) {
    include_once('named.php');
    static $roadetc = 'road|rd|street|st|lane|ln|place|pl|avenue|ave|crescent|cres|close|cl|way|wy|drive|dr|walk|park|pk|row|hill|parade|pde|terrace|tce|court|ct|mews|grove|rise|fields|meadows|path|green|gn|gardens|gdns|garden|gdn|gate';
    static $namechars = "[a-z \\\\'\\\\.\\\\-]";
    /* turn line breaks into commas so multi-line addresses become comma separated
       clauses, then drop all remaining markup */
    $subject = strip_tags(preg_replace('~\\<br\\ *\\/?\\>~i', ',', $subject));
    $roadsection = " *({$roadetc})";

    /* lat,lon to constrain place checks to uk: just clips northern France, and part of RoI,
       but it is only to slim down the search */
    $uk = array(49.9,-8.1,61.0,2.0);
    global $db;

    /* hash of places to avoid repeatedly looking up the same place; every entry is a
       place that has already failed, because a successful lookup returns immediately */
    $triedplaces = array();

    for ($i = 0; $i < 2; $i++) {
      /* Capture layout of the pattern below: 1 is the street; while $roadsection is
         present 2 is the road type (road, rd, ...), which shifts everything after it up
         by one.  The four place clauses then sit at every second index starting from
         $firstplace; the indexes in between are the same clauses with their leading
         comma, which are of no interest. */
      $firstplace = ($roadsection === '') ? 2 : 3;

      preg_match_all ("/[^0-9][0-9]{1,3}[\\, ] *({$namechars}{1,30}{$roadsection})\\.?\\, *({$namechars}{1,})(\\, *({$namechars}{1,}))?(\\, *({$namechars}{1,}))?(\\, *({$namechars}{1,}))?\\,? *{$this->postcode}/i", $subject, $matches, PREG_SET_ORDER);

      foreach ($matches as $submatches) {
        $places = NULL;
        $possibleplace = NULL;
        for ($k = $firstplace; $k <= $firstplace + 6; $k += 2) {
          /* optional clauses that did not take part in the match are empty or absent */
          if (empty($submatches[$k])) { continue; }
          /* the character class allows spaces, so trim before the exact-match lookup */
          $candidate = trim($submatches[$k]);
          if ($candidate === '' || array_key_exists($candidate, $triedplaces)) { continue; }
          $triedplaces[$candidate] = TRUE;
          $places = named::lookupplaces($candidate, $uk, TRUE /* exact match */);
          if (! empty($places)) {
            $possibleplace = $candidate;
            break;
          }
        }
        if (empty($places)) { continue; }
        $this->namefinderquery = "{$submatches[1]}, {$possibleplace}, {$this->postcodeprefix->prefix}";

        $db->log("looking up {$this->namefinderquery} for {$this->postcode} ". print_r($submatches,1));
        return TRUE;
      }

      /* nothing helped, so relax the search so it doesn't include things like 'Road',
         e.g. 47 The Brambles, Somewhereville, SG8 1TX */
      $roadsection = '';
    }

    /* now try it without a number. But to isolate the place from among the four clauses
       try looking up each prospective place in the place index, and take the clause before
       it as the street (or sometimes the business name:
       eg 'University of East Anglia, Norwich, NR4 7TJ') */

    if (preg_match_all ("/({$namechars}{1,})\\, *({$namechars}{1,30})(\\, *({$namechars}{1,30}))?(\\, *({$namechars}{1,30}))?\\,? *{$this->postcode}/i", $subject, $matchesall, PREG_SET_ORDER))
    {
      $db->log("considering: ".print_r($matchesall,1));

      foreach ($matchesall as $matches) {
        /* captures: 1 and 2 are the first two clauses, 4 and 6 the optional ones;
           3 and 5 are the optional clauses including their leading comma */
        $ngroups = count($matches);
        for ($j = 2; $j < $ngroups; $j++) {
          $possibleplace = trim($matches[$j]);
          /* skip unmatched optional clauses and the comma-led wrapper captures */
          if (empty($possibleplace) || $possibleplace[0] == ',') { continue; }
          if (array_key_exists($possibleplace, $triedplaces)) { continue; }
          $places = named::lookupplaces($possibleplace, $uk, TRUE /* exact match */);
          $triedplaces[$possibleplace] = TRUE;
          if (empty($places)) { continue; }
          $db->log("places found for {$possibleplace}: ".print_r($places,1));
          /* the clause before capture 2 is capture 1; for captures 4 and 6 it is two
             back (the one in between is the comma-led wrapper).  Using $j-2 for
             capture 2 would pick up the entire match, postcode included. */
          $address = trim($matches[($j == 2) ? 1 : $j - 2]);
          $this->namefinderquery = "{$address}, {$possibleplace}";
          $db->log("looking up {$this->namefinderquery} for {$this->postcode} ".
                   print_r($matches,1));
          return TRUE;
        }
      }
    }

    return FALSE;
  }

  // --------------------------------------------------
  /**
   * Fetch the search results page for the quoted postcode and try to match an address
   * in it; failing that, try each page linked as 'cached' from the results.
   *
   * Returns nothing: success is signalled by match() having set $this->namefinderquery.
   */
  /* private */ function googleme() {
    $qs = $this->googlequery . urlencode("\"{$this->postcode}\"");
    $googleresult = file_get_contents($qs);
    if ($googleresult === FALSE) { return; }
    if ($this->match($googleresult)) { return; }

    /* try the cached pages if we didn't find it on the search results page.
       [^>]* rather than .* keeps the match inside a single <a ...> tag, so several
       cached links on one line are each picked up instead of only the last one */
    preg_match_all ("~\\<a [^\\>]*href=[\"\']([^\"\']*)[\"\'][^\\>]*\\>cached\\<\\/a\\>~i",
                    $googleresult,
                    $matches, PREG_PATTERN_ORDER);
    foreach ($matches[1] as $cn => $cacheurl) {
      /* the href was lifted out of HTML, so &amp; and friends must be decoded
         before it is usable as a URL */
      $cacheurl = html_entity_decode($cacheurl);
      $cacheresult = file_get_contents($cacheurl);
      /* one unreachable cached page should not stop us trying the others */
      if ($cacheresult === FALSE) { continue; }
      if ($this->match($cacheresult)) {
        global $db; $db->log ("found in cache {$cn}: {$cacheurl}");
        return;
      }
    }
  }

}
160
161?>
Note: See TracBrowser for help on using the repository browser.