Ticket #2390: import.php

File import.php, 15.9 KB (added by woidrick, 10 years ago)

Hack, sorry; can't use diff.

Line 
1<?php
2
3/* This file takes an OSM planet file, or a planet diff, and processes
4   it to update the name finder database noting what has changed. A
5   further program then updates the index from the total information
6   this program generates.
7
8   The process takes many hours to run, so it is better to run it on a
9   temporary database and then export the tables (other than node and
10   way) and import them into the production database, which takes only
11   maybe a minute or so.  */
12
13session_start(); // only so we get a unique log file
14
15include_once('preamble.php');
16include_once('named.php');
17include_once('changedid.php');
18include_once('placeindex.php');
19include_once('region.php');
20include_once('canonical.php');
21include_once('word.php');
22
23include_once('tagged.php');
24include_once('node.php');
25include_once('way.php');
26include_once('relation.php');
27include_once('way_node.php');
28include_once('relation_node.php');
29include_once('relation_way.php');
30include_once('relation_relation.php');
31include_once('changeset.php');
32
/* Running totals of records added during this run, keyed by record type.
   They are printed under "added:" once the whole file has been parsed.
   The keys are created in the same order the original chained assignment
   produced them (rightmost target first). */
$added['canonical'] = 0;
$added['named'] = 0;
$added['placeindex'] = 0;
$added['relation'] = 0;
$added['way'] = 0;
$added['node'] = 0;

/* 'tooclose' setting copied out of the configuration loaded by the includes above */
$tooclose = $config['tooclose'];
37
38// ==================================================
class nodecache {
  /* We maintain a node cache as we create nodes and use them, because we
     frequently refer to the same node a few times in quick succession from,
     say, a way or relation.

     $cache   maps node id => node object, oldest entry first
     $size    number of entries currently held
     $maxsize maximum number of entries kept before the oldest is evicted */
  var $cache = array();
  var $size = 0;
  var $maxsize = 500;

  /* Return the node with the given id.
     A cached copy is returned when there is one; otherwise the node is
     selected from the database, remembered in the cache and returned.
     Returns FALSE when the database select finds nothing. */
  function getnode($uid) {
    if (isset($this->cache[$uid])) {
      return $this->cache[$uid];
    }
    global $db;
    $node = new node($uid, TRUE);
    if ($db->select($node) > 0) {
      $this->putnode($node, $uid);
      return $node;
    }
    return FALSE;
  }

  /* Store a copy of $node under id $uid, evicting the oldest entries if the
     cache is full. */
  function putnode($node, $uid) {
    if (! isset($this->cache[$uid])) {
      /* Evict by unsetting the first key. array_shift() must not be used
         here: it renumbers integer keys from zero, which would silently
         re-label every remaining node with the wrong id. */
      while (! empty($this->cache) && count($this->cache) >= $this->maxsize) {
        reset($this->cache);
        unset($this->cache[key($this->cache)]);
      }
    }
    $this->cache[$uid] = clone $node;
    /* keep the public counter in step even when an existing id is overwritten */
    $this->size = count($this->cache);
  }
}
68
69// ==================================================
70/* the main program... */
71
function zap($c) {
  /* Truncate the table named by $c and report on stdout: the table name
     followed by whatever the database layer's truncate() returned. */
  global $db;
  echo "zapped {$c} ";
  echo $db->truncate($c);
  echo "\n";
}
77
78// --------------------------------------------------
79
/* Main script: open the planet/diff file named on the command line, reset the
   change-tracking tables (unless augmenting), record the index date, then
   stream the file through the XML parser whose callbacks do the real work. */

if (! isset($argv[1])) { die ("usage import.php planet_or_osc_filename\n"); }
$planetfilename = $argv[1];
/* a leading '+' on the file name means "augment": keep the existing
   canonical/changedid tables rather than emptying them first */
$augment = substr($planetfilename, 0, 1) === '+';
if ($augment) { $planetfilename = substr($planetfilename, 1); }

/* extract the date from the planet file name so we can note the index date in the database */
if (preg_match('/-(20)?([0-9]{2})-?([0-9]{2})-?([0-9]{2})\\./', $planetfilename, $matches)) {
  $planetdate = "20{$matches[2]}-{$matches[3]}-{$matches[4]}";
  echo "for file date {$planetdate}\n";
}

/* pick open/read/close functions to suit the compression implied by the extension */
if (preg_match('/\\.bz2$/i', $planetfilename)) {
  $planetfd = bzopen($planetfilename, 'r');
  $planetread = 'bzread';
  $planetclose = 'bzclose';
} else if (preg_match('/\\.gz$/i', $planetfilename)) {
  $planetfd = gzopen($planetfilename, 'r');
  $planetread = 'gzread';
  $planetclose = 'gzclose';
} else {
  $planetfd = fopen($planetfilename, 'r');
  $planetread = 'fread';
  $planetclose = 'fclose';
}
if ($planetfd === FALSE) { die("cannot open '{$planetfilename}'\n"); }

$planetsize = filesize($planetfilename);
/* a negative size is treated as a wrapped 32-bit value and corrected */
if($planetsize < 0) { $planetsize = pow(2.0, 32) + $planetsize; }
echo "planet size {$planetsize}\n";
$planetdonepermill = 0;

$starttime = time();
echo "started at ", date("H:i:s", $starttime), "\n";

/* start clean for updated items */
if (! $augment) {
  zap('canonical');
  zap('changedid');
}

$doing_delete = FALSE;
$doing_modify = FALSE;

/* update the index date */
include_once('options.php');
$options = new options();
$options->name = 'indexdate';
$db->delete($options, 'name');
if (isset($planetdate)) {
  $options->value = $planetdate;
  $db->insert($options);
}

/* the planet file is too big to do a filesize call on (> 4Gb) so
   instead set off a grep to find out how many elements there are and
   do progress based on proportion of elements. We'll check back later
   to see if this has finished.
   (Currently disabled; kept correct in case it is switched back on.) */
if (0) {
$elementcheckfile = '/tmp/osmimportgrep';
file_put_contents($elementcheckfile, '');
exec ("grep -c \"\\(<node \\|<way \\|<relation \\)\" " . escapeshellarg($planetfilename) .
      " > " . escapeshellarg($elementcheckfile) . " &");
}
$elementcount = $currentelementcount = 0;

/* initialise the node cache */
$nc = new nodecache();

/* start the xml parser. This makes callbacks when it sees the start and end of each element */
$xml_parser = xml_parser_create();
xml_set_element_handler($xml_parser, "startelement", "endelement");

$bytesread = 0.0;
$onemb = 1024.0 * 1024.0;
$mb = 0;
while (TRUE) {
  $data = $planetread($planetfd, 4096);
  /* stop only on a real end of input or read error; a chunk that happens to
     be the string "0" is data, not end of file */
  if ($data === FALSE || $data === '') { break; }

  /* count what was actually returned: compressed readers may deliver short chunks */
  $bytesread += strlen($data);
  if ($bytesread > $onemb) {
    $mb += 1;
    echo "\n", with_commas($mb), "Mb\n";
    $bytesread -= $onemb;
  }
  if (!xml_parse($xml_parser, $data, feof($planetfd))) {
    die(sprintf("XML error: %s at line %d",
                xml_error_string(xml_get_error_code($xml_parser)),
                xml_get_current_line_number($xml_parser)));
  }
}
xml_parser_free($xml_parser);
$planetclose($planetfd);

echo "added:\n";
foreach ($added as $class => $count) {
  echo "{$class}: {$count}\n";
}

/* report the elapsed wall-clock time as h:mm:ss */
$endtime = time();
echo "finished at ", date("H:i:s", $endtime), "\n";
$seconds = $endtime - $starttime;
$minutes = (int)($seconds/60);
$seconds -= $minutes*60;
$hours = (int)($minutes/60);
$minutes -= $hours*60;
printf("took %d:%02d:%02d\n", $hours, $minutes, $seconds);

/* and we're done */
186
187// ==================================================
188/* XML parser callbacks
189
190   $tagged is the current node, way or relation we are processing, where tags go
191   when we encounter them, and where nodes of ways and members of relations go */
192
function startelement($parser, $name, $attrs)
{
  /* XML start-of-element callback.

     $parser  the XML parser making the callback
     $name    element name; compared here in upper case
     $attrs   attribute name => value, names in upper case

     <delete>/<modify>/<add>/<create> wrappers set the global mode flags.
     <node>/<way>/<relation>/<changeset> create the object that becomes the
     global $tagged. <tag>/<nd>/<member> children are handed to that object.
     Any other element is ignored. Malformed input that cannot be processed
     terminates the script with die(). */
  global $tagged, $currentelementcount;
  global $doing_delete, $doing_modify;
  static $seenway = FALSE;
  static $seenrelation = FALSE;

  /* What we do depends on element type... */

  switch($name) {
  case 'DELETE':
    $doing_delete = TRUE;
    $doing_modify = FALSE;
    break;
  case 'MODIFY':
    $doing_delete = FALSE;
    $doing_modify = TRUE;
    break;
  case 'ADD':
  case 'CREATE':
    $doing_delete = FALSE;
    $doing_modify = FALSE;
    break;

  case 'TAG':
    /* Distribute tags to the object which encloses the tag elements */
    if (empty($tagged)) {
      /* build the diagnostic defensively: the parser may be an object that
         cannot be cast to a string, and V may be absent */
      $parserlabel = is_object($parser) ? get_class($parser) : (string)$parser;
      $value = isset($attrs['V']) ? $attrs['V'] : '';
      die("no tagged ".$parserlabel." ".$name." ".$value."\n");
    }
    if (!isset($attrs['K']) || !isset($attrs['V'])) {
      die("no tag K or V".print_r($tagged,1)."\n".print_r($attrs,1)); }
    $tagged->add_tag($attrs['K'], $attrs['V']);
    break;
  case 'ND': /* constituent node of way */
    if ($doing_delete) { break; }
    if (empty($tagged)) { die("no tagged\n"); }
    if (! isset($attrs['REF'])) {
      echo("no node REF".print_r($tagged,1).print_r($attrs,1));
    }  else {
      $tagged->add_node((int)$attrs['REF']);
    }
    break;
  case 'MEMBER': /* constituent member of relation */
    if ($doing_delete) { break; }
    if (empty($tagged)) { die("no tagged\n"); }
    if (! isset($attrs['REF']) || ! isset($attrs['TYPE'])) {
      echo("no node REF".print_r($tagged,1).print_r($attrs,1));
    }  else {
      switch ($attrs['TYPE']) {
      case 'node':
        $tagged->add_node((int)$attrs['REF']);
        break;
      case 'way':
        $tagged->add_way((int)$attrs['REF']);
        break;
      case 'relation':
        /* relation members of relations are not recorded */
        break;
      default:
        echo("unrecognised TYPE".print_r($tagged,1).print_r($attrs,1));
      }
    }
    break;

  case 'NODE':
    if (! empty($tagged)) { die("tagged active\n".print_r($tagged,1)); }
    if (empty($attrs['ID'])) { die("no node ID"); }
    $tagged = new node((int)$attrs['ID']);
    if (! isset($attrs['LAT']) || ! isset($attrs['LON'])) {
      die("no node LAT/LON".print_r($tagged,1)); }
    $tagged->set_latlon((double)$attrs['LAT'], (double)$attrs['LON']);
    $currentelementcount++;
    break;
  case 'CHANGESET':
    if (! empty($tagged)) { die("tagged active\n".print_r($tagged,1)); }
    if (empty($attrs['ID'])) { die("no changeset ID"); }
    $tagged = new changeset((int)$attrs['ID']);
    /* a changeset without a complete bounding box is accepted, it just gets
       no position; otherwise position it at the centre of its bounding box */
    if (isset($attrs['MIN_LON']) && isset($attrs['MAX_LON'])
     && isset($attrs['MIN_LAT']) && isset($attrs['MAX_LAT'])) {
      $midlat = ((double)$attrs['MIN_LAT'] + (double)$attrs['MAX_LAT']) / 2;
      $midlon = ((double)$attrs['MIN_LON'] + (double)$attrs['MAX_LON']) / 2;
      $tagged->set_latlon($midlat, $midlon);
    }
    $currentelementcount++;
    break;
  case 'WAY':
    if (! $seenway) {
      /* first way ... */
      echo "\nstarting ways at " . date("H:i:s") . "\n";
      $seenway = TRUE;
    }
    if (! empty($tagged)) { die("tagged active\n".print_r($tagged,1)); }
    if (empty($attrs['ID'])) { die("no way ID"); }
    $tagged = new way((int)$attrs['ID']);
    $currentelementcount++;
    break;
  case 'RELATION':
    if (! $seenrelation) {
      /* first relation ... */
      echo "\nstarting relations at " . date("H:i:s") . "\n";
      $seenrelation = TRUE;
    }
    if (! empty($tagged)) { die("tagged active\n".print_r($tagged,1)); }
    if (empty($attrs['ID'])) { die("no relation ID"); }
    $tagged = new relation((int)$attrs['ID']);
    $currentelementcount++;
    break;
  }
}
329
330// --------------------------------------------------
function endelement($parser, $name)
{
  /* XML end-of-element callback. When a node, way, relation or changeset
     element closes, the object built up in the global $tagged is analysed
     and then deleted, added or replaced according to the current
     delete/modify mode; $tagged is then cleared ready for the next element.
     The end of any other element is ignored.

     $parser  the XML parser making the callback (unused)
     $name    element name; compared here in upper case */
  global $tagged, $doing_delete, $doing_modify, $added;
  global $db;
  /* last count echoed as progress, remembered separately for each class */
  static $lastadded = array();

  switch($name) {
  case 'NODE':
  case 'WAY':
  case 'RELATION':
  case 'CHANGESET':
    /* ignore coastlines */
    if (isset($tagged->tags['natural']) && $tagged->tags['natural'] == 'coastline') {
      $tagged = null;
      return;
    }

    /* analyse the object */
    $tagged->interesting_name();

    /* we have four possibilities:
       1. simple addition: as per a raw planet file
       2. deletion: the object is going away for ever
       3. modification(1): the object does not already exist - the diffs only say modified
          so we can't tell - so treat this as addition
       4. modification(2): the object does exist, so treat as deletion followed by addition

       In all cases we need to note that the canonical names have changed so
       that we can reprocess them; in case 4, both the old canonical and the new one
       have changed.
    */

    $do_delete = $doing_delete;

    if ($doing_modify) {
      /* if the object already exists, as well as deleting it below, add the *old*
         canonical form to the list of those changed, so that if
         there are others with the same canonical nearby they will
         eventually be revealed in place of this one */
      $named = new named();
      $named->id = $tagged->id;
      $q = $db->query();
      while ($q->select($named) > 0) {
        $do_delete = TRUE;
        if (empty($named->canonical)) { continue; }
        $canonical = new canonical($named->canonical, $named->region);
        $db->insert($canonical);
      }
    }

    if ($doing_modify || $doing_delete) {
      /* a node that is changed or deleted that is part of a way or
         relation or a way that is part of a relation will affect its
         parent - e.g. a node moving may change the location of a way,
         and that in turn may affect other similar names nearby as
         well as the parent. For example node gets deleted, owning way
         shifts north, section of the same road, but different way,
         further south is now more than 3km from the original way, so
         gets added to the index. This kind of indirect effect is what
         makes updating the index incrementally so problematic, and
         why it is done on the basis of changes affecting the
         canonical string, not by id */
      $parent_ids = $tagged->parent_ids();
      if (! empty($parent_ids)) {
        $named = new named();
        $ors = array();
        foreach ($parent_ids as $id) { $ors[] = y_op::eq('id',$id); }
        $q = $db->query();
        $q->where(count($ors) == 1 ? $ors[0] : y_op::oor($ors));
        while ($q->select($named) > 0) {
          if (empty($named->canonical)) { continue; }
          $canonical = new canonical($named->canonical, $named->region);
          $db->insert($canonical);
        }
      }
    }

    /* delete the object and its named records, whether this is a delete or a
       modify (we'll add a modified object back again in a moment). There may
       not be any index words to delete, if the named was not already in the
       index, but no matter */
    if ($do_delete) {
      $tagged->delete();
      if (deletebyid('named', $tagged->id)) {
        deletebyid('word', $tagged->id);
        deletebyid('placeindex', $tagged->id);
      }
    }

    /* whether we delete, add or modify, the canonical form we made
       is changed, so add it to the table of changed canonicals. In
       cases of deletion only, if there are others with the same
       canonical nearby they will eventually be revealed in place of
       this one by virtue of these entries */
    if (! empty($tagged->named)) {
      if (! empty($tagged->named->canonical)) {
        $canonical = new canonical($tagged->named->canonical, $tagged->named->region);
        $db->insert($canonical);
        $added['canonical']++;
      } else {
        /* for anonymous items that we still want to index by type, keep a record
           of their id for the second pass */
        $changedid = new changedid($tagged->id);
        $db->insert($changedid);
      }
    }

    if (! $doing_delete) {
      /* add the object, named (duplicated for as many different canonical forms
         as required) */
      $tagged->insert();
      if (! is_null($tagged->named)) {
        $db->insert($tagged->named);
        $added['named']++;
      }
    }

    /* progress: echo the running total for this class each time it reaches a
       new multiple of 1000. Classes with no counter (e.g. changeset) count as
       zero rather than raising an undefined-index error. */
    $class = get_class($tagged);
    $n = isset($added[$class]) ? $added[$class] : 0;
    $last = isset($lastadded[$class]) ? $lastadded[$class] : 0;
    if ($n % 1000 == 0 && $n != $last) { echo with_commas($n), ' '; $lastadded[$class] = $n; }

    $tagged = NULL;
    break;
  }

}
457
458// --------------------------------------------------
function with_commas($n) {
  /* Format a count with a comma between each group of three digits,
     e.g. 1234567 => "1,234,567".

     $n is converted to an integer first. Delegating to number_format()
     keeps groups that are entirely zero, so 1000000 gives "1,000,000"
     rather than losing the middle group. */
  return number_format((int) $n, 0, '.', ',');
}
476
477// --------------------------------------------------
function deletebyid($class, $id) {
  /* Delete the records of the given class that carry the given id.
     Deleting is slow, so a one-row lookup is made first and the delete is
     only issued when that lookup finds something.
     Returns TRUE if a delete was issued, FALSE if nothing matched. */
  global $db;

  /* probe object of the requested class, carrying only the id */
  $probe = new $class();
  $probe->id = $id;

  $lookup = $db->query();
  $lookup->limit(1);
  $found = $lookup->select($probe);

  if ($found != 0) {
    $db->delete($probe, 'id');
    return TRUE;
  }
  return FALSE;
}
489
490?>