source: subversion/applications/utils/export/osm2csv/osm2csv-segments.pl @ 4755

Last change on this file since 4755 was 4755, checked in by joerg, 13 years ago

New Version to create segment list. With 'only' 500MB RAM usage

  • Property svn:executable set to *
File size: 10.9 KB
Line 
1#!/usr/bin/perl
2
BEGIN {
    # Strip the last two path components (parent directory and script
    # name) from the path this script was started with.
    (my $dir = $0) =~ s,[^/]+/[^/]+$,,;

    # Prepend the candidate library directories; the first entry listed
    # here is the first one searched.
    # NOTE(review): Perl does not expand "~" in @INC entries, and
    # "../perl_perl_lib" looks like a typo for "../perl_lib" -- confirm.
    unshift @INC,
        "$ENV{HOME}/svn.openstreetmap.org/applications/utils/perl_lib",
        "~/svn.openstreetmap.org/applications/utils/perl_lib",
        "../perl_perl_lib",
        "$dir/../perl_lib";
}
12
13
14use strict;
15use warnings;
16
17use XML::Parser;
18use Getopt::Long;
19use Storable ();
20use IO::File;
21use Pod::Usage;
22use Data::Dumper;
23
24use Geo::Filter::Area;
25use Geo::OSM::Planet;
26use Utils::Debug;
27use Utils::File;
28use Utils::LWP::Utils;
29use Utils::Math;
30use File::Slurp;
31
# Forward declaration, so the prototype is known where the main loop
# calls parse_planet() further down.
sub parse_planet($$);

# Values filled in from the command line
our $help = 0;
our $man  = 0;
my $Filename;
my $areas_todo;
my $tie_nodes_hash = undef;    # undef means "decide automatically"
my $do_update_only = 0;
my $do_list_areas  = 0;

# Option names are case sensitive, so --man and --MAN are separate entries
Getopt::Long::Configure('no_ignore_case');
GetOptions(
    # input / processing
    'osm=s'          => \$Filename,
    'area=s'         => \$areas_todo,
    'list-areas'     => \$do_list_areas,
    'update-only'    => \$do_update_only,
    'tie-nodes-hash' => \$tie_nodes_hash,

    # download of the planet file
    'proxy=s'        => \$Utils::LWP::Utils::PROXY,
    'no-mirror'      => \$Utils::LWP::Utils::NO_MIRROR,

    # diagnostics and documentation
    'verbose+'       => \$VERBOSE,
    'd+'             => \$DEBUG,
    'debug+'         => \$DEBUG,
    'h|help|x'       => \$help,
    'man'            => \$man,
    'MAN'            => \$man,
) or pod2usage(1);

# Area names are compared in lower case; germany is the default
$areas_todo = lc( $areas_todo || 'germany' );
63
# See if we'll have to tie the Nodes Hash to a File
# This is at least 10 times slower, but we have fewer problems with
# running out of memory.
# The decision is only taken automatically when --tie-nodes-hash was
# not given on the command line.
if ( ! defined $tie_nodes_hash ) {
    my $max_ram=mem_info("MemTotal");
    # Without a usable value assume no memory at all, so that every
    # area with an estimate below gets its hash tied.
    $max_ram = 0 unless defined $max_ram;
    $max_ram =~ s/MB//;

    # Estimated memory needed per area; compared against the MemTotal value
    my $estimated_memory = {
        africa     => 2500,
        france     =>  192,
        europe     => 3000,
        germany    =>  500,
        uk         =>  660,
        world      => 4000,
        world_east => 4000,
        world_west => 4000,
    };
    for my $area ( split(",",$areas_todo )){
        my $needed = $estimated_memory->{$area};
        # Areas without an estimate never force the tie
        next unless defined $needed;
        if ( $needed > $max_ram ) {
            $tie_nodes_hash=1;
            last;
        }
    }
}
85
# Documentation requests are answered before any work is done
pod2usage(1) if $help;
pod2usage(-verbose=>2) if $man;

if ( $do_list_areas ) {
    print Geo::Filter::Area->list_areas()."\n";
    exit;
}

# TODO:
# if the input filename is not planet*osm* we have to change the output filename too.

# The input file comes from --osm or from the first remaining argument.
# If it is missing or empty, fall back to what mirror_planet() returns.
$Filename ||= shift();
unless ( $Filename && -s $Filename ) {
    $Filename = mirror_planet();
};

# Show the usage when there is still no filename; this has to happen
# before the size test, which would otherwise run on an undefined value.
pod2usage(1) unless $Filename;

if ( ! -s $Filename ) {
    die "Cannot read $Filename\n";
}
105
# Handle the XML is read from; set by parse_planet()
our $READ_FH = undef;
# Position handed to setpos() when the unparsed rest of the input is printed
our $OK_POS = 0;

# Attributes of the node/segment element currently being parsed
our %MainAttr;
# Short type code of that element ("n" or "s")
our $Type;
# Tags (k => v) collected for that element
our %Tags;

# Stored data
# Node id => "lat,lon" for every node accepted by the area filter
our %Nodes;
# Counters and measurements collected while parsing
our %Stats;

# Currently active Area Filter
our $AREA_FILTER;
# Time parsing started / time the progress line was printed last
our $PARSING_START_TIME   = 0;
our $PARSING_DISPLAY_TIME = 0;

# Directory the csv files are written to
my $data_dir = planet_dir() . "/csv";
mkdir_if_needed( $data_dir );
119
# Main loop: write one osm-segments-<area>.csv file per requested area.
# The data is first written to <file>.part and only renamed to its final
# name after the output handle has been closed.
for my $area_name ( split(",",$areas_todo) ) {
    # An empty list entry (e.g. "germany,,uk") must not produce a file
    die "No Area Name defined\n"
        unless $area_name;

    my $base_filename = "$data_dir/osm-segments-$area_name";
    my $filename      = "$base_filename.csv";

    if ( $do_update_only ) {
        # Compare against the file that is really written below
        next unless file_needs_re_generation($Filename,$filename);
        print STDERR "Update needed. One of the files is old or non existent\n" if $VERBOSE;
    }

    # -----------------------------------------------------------------------------
    # Temporary data: start every area with empty parser state
    %MainAttr = ();
    %Tags     = ();
    $Type     = '';
    %Nodes    = ();
    %Stats    = ();

    $PARSING_START_TIME=0;
    # Estimated Number of elements to show progress while reading in percent
    for my $type ( qw(elem tag node segment )) {
        $Stats{"${type} estim"} = estimated_max_count($type);
        $Stats{"${type} seen"}=0;
        $Stats{"${type} read"}=0;
    }

    #----------------------------------------------
    # Processing stage
    #----------------------------------------------

    print STDERR "creating $filename\n" if $VERBOSE;

    if ( $tie_nodes_hash ) {
        # maybe we should move this file to /tmp
        # and lock it, and delete it in an END {} -Block
        print STDERR "Tie-ing Nodes Hash to '$base_filename-Nodes.db'\n";
        dbmopen(%Nodes,"$base_filename-Nodes.db",0666)
            or die "Could not open DBM File '$base_filename-Nodes.db': $!";
        # A DBM file left over from an earlier run still holds its old
        # nodes; start from an empty hash.
        %Nodes = ();
    }
    $Stats{"Tie Nodes_hash"} = $tie_nodes_hash;

    # OSM stays a bareword handle because DoEnd() prints to it by name
    if ( ! open(OSM, '>', "$filename.part") ) {
        warn "output_osm: Cannot write to $filename.part: $!\n";
        dbmclose(%Nodes) if $tie_nodes_hash;
        next;
    }
    binmode(OSM,":utf8");
    parse_planet($Filename,$area_name);

    print STDERR "Creating output files\n";

    # Flush and close before looking at the size: buffered output is not
    # visible to -s while the handle is still open.
    close(OSM)
        or warn "Could not close $filename.part: $!\n";
    dbmclose(%Nodes) if $tie_nodes_hash;

    if ( -s "$filename.part" ) {
        rename("$filename.part",$filename)
            or warn "Could not rename $filename.part to $filename: $!\n";
    }

    print STDERR "$area_name Done\n";
}
exit;
178
179
# Format $part as a whole-number percentage of $full, e.g. "50%".
# Returns the empty string when $full is false (0, "" or undef), so
# there is never a division by zero.
sub percent_string($$){
    my ($numerator, $denominator) = @_;

    return "" unless $denominator;
    return sprintf("%.0f%%", 100 * $numerator / $denominator);
}
187
#----------------------------------------------
# Parsing planet.osm File
#----------------------------------------------
# Runs XML::Parser over the OSM file and lets the DoStart/DoEnd/DoChar
# handlers collect the data for one area.
#
# Parameters:
#   $Filename  - name of the OSM file, handed to data_open()
#   $area_name - area name, handed to Geo::Filter::Area->new()
#
# Side effects: sets $AREA_FILTER, $PARSING_START_TIME and $READ_FH and,
# after a successful parse, $Stats{"time parsing"}.
# Problems are reported on STDERR; the return value carries no meaning.
sub parse_planet($$){
    my $Filename = shift;
    my $area_name = shift;

    print STDERR "Reading and Parsing XML from $Filename for $area_name\n" if $DEBUG|| $VERBOSE;

    $AREA_FILTER = Geo::Filter::Area->new( area => $area_name );

    $PARSING_START_TIME=time();
    $READ_FH = data_open($Filename);
    if ( ! $READ_FH ) {
        print STDERR "WARNING: Could not open osm data $Filename\n";
        return;
    }

    my $P = XML::Parser->new( Handlers => {
        Start => \&DoStart,
        End => \&DoEnd,
        Char => \&DoChar,
        });
    if (not $P) {
        print STDERR "WARNING: Could not parse osm data $Filename\n";
        $READ_FH->close();
        return;
    }

    # Keep the outcome of the eval right away: $@ is a global and later
    # calls may overwrite it.
    my $parse_ok = eval {
        $P->parse($READ_FH);
        1;
    };
    my $parse_error = $@;

    if ( $VERBOSE || $DEBUG )  {
        print STDERR "\n";
    }

    if ( ! $parse_ok ) {
        # Print out not parsed lines (at most 20) to help locating the problem
        my $count=20;
        $READ_FH->setpos($OK_POS);
        while ( ($count--) && defined(my $line = $READ_FH->getline()) ) {
            print "REST: $line";
        }
    }

    # Close the input on success and on failure alike
    $READ_FH->close();

    if ( ! $parse_ok ) {
        print STDERR "WARNING: Could not parse osm data $Filename\n";
        print STDERR "ERROR: $parse_error\n";
        return;
    }

    $Stats{"time parsing"} = time()-$PARSING_START_TIME;
    printf("osm2csv: Parsing Osm-Data in %.0f sec\n",time()-$PARSING_START_TIME )
        if $DEBUG || $VERBOSE;

}
235
236
# Function is called whenever an XML tag is started
#----------------------------------------------
# XML::Parser "Start" handler.
#
# Parameters:
#   $Expat - the parser object (not used)
#   $Name  - name of the element that starts
#   %Attr  - attributes of that element
#
# For <node> and <segment> the attributes are remembered in %MainAttr,
# %Tags is emptied and $Type is set to "n" or "s".
# For <tag> the k/v pair is stored in %Tags and $Stats{"tag"} is counted.
# All other elements are ignored.
sub DoStart
{
    my ($Expat, $Name, %Attr) = @_;

    if($Name eq "node" || $Name eq "segment"){
        %Tags = ();
        %MainAttr = %Attr;
        $Type = ( $Name eq "node" ) ? "n" : "s";
    } elsif($Name eq "tag"){
        # TODO: protect against id,from,to,lat,long,etc. being used as tags
        # A tag without a key cannot be stored, but it is still counted
        $Tags{$Attr{"k"}} = $Attr{"v"}
            if defined $Attr{"k"};
        $Stats{"tag"}++;
    }
}
257
# Function is called whenever an XML tag is ended
#----------------------------------------------
# XML::Parser "End" handler.
#
# Parameters:
#   $Expat   - the parser object (not used)
#   $Element - name of the element that ends
#
# Counts every element in %Stats. A <node> accepted by $AREA_FILTER is
# stored in %Nodes as "lat,lon"; a <segment> whose two end nodes are both
# stored is written to the OSM output handle as "from,to".
# With $VERBOSE or $DEBUG set a progress line is printed about once a second.
#
# The input handle is not touched here: XML::Parser is still reading
# from it while the handlers run.
sub DoEnd {
    my ($Expat, $Element) = @_;
    my $ID = $MainAttr{"id"};
    $Stats{"${Element} seen"}++;
    $Stats{"elem seen"}++;

    if ( $Stats{"${Element} seen"} == 1 ) {
        # Record the memory use at the first element of each kind
        $Stats{"memory at 1st $Element rss"} = sprintf("%.0f",mem_usage('rss'));
        $Stats{"memory at 1st $Element vsz"} = sprintf("%.0f",mem_usage('vsz'));
        if ( $DEBUG >1 || $VERBOSE >1) {
            print STDERR "\n";
        }
    }

    if($Element eq "node"){
        if ( defined($ID) && $AREA_FILTER->inside(\%MainAttr) ) {
            $Nodes{$ID} = sprintf("%f,%f",$MainAttr{lat}, $MainAttr{lon});
            $Stats{"node read"}++;
            $Stats{"elem read"}++;
        }
    } elsif($Element eq "segment"){
        my $from = $MainAttr{"from"};
        my $to   = $MainAttr{"to"};
        # Only segments with both end nodes inside the area are written
        if ( defined($from) && defined($to)
             && defined($Nodes{$from}) && defined($Nodes{$to}) ) {
            printf OSM "%s,%s\n",$from,$to;
            $Stats{"segment read"}++;
            $Stats{"elem read"}++;
        }
    }

    if ( ( $VERBOSE || $DEBUG )
         && ( time()-$PARSING_DISPLAY_TIME > 0.9) ) {
        $PARSING_DISPLAY_TIME= time();
        _show_parse_progress();
    }
}

# Print one progress line (element counters, memory use, time estimate) to STDERR.
sub _show_parse_progress {
    print STDERR "\r";
    print STDERR "Read(".$AREA_FILTER->name()."): ";
    for my $k ( qw(elem node segment ) ) {
        # Full counter names only at very high verbosity
        if ( $DEBUG>6 || $VERBOSE>6) {
            print STDERR $k;
        } else {
            print STDERR substr($k,0,1);
        }
        print STDERR ":";
        printf STDERR "%d read",($Stats{"$k read"}||0);
        printf STDERR "=%s",percent_string($Stats{"$k read"},$Stats{"$k seen"});

        printf STDERR "(%d seen",($Stats{"$k seen"}||0);
        printf STDERR "=%s",percent_string($Stats{"$k seen"},$Stats{"$k estim"});
        print STDERR ") ";
    }

    # The maximum is only shown once it exceeds the current value by 30%
    my $rss = sprintf("%.0f",mem_usage('rss'));
    $Stats{"max rss"} = max($Stats{"max rss"}||0,$rss) if $rss;
    printf STDERR "max-rss %d" ,($Stats{"max rss"}) if ($Stats{"max rss"}||0) >$rss*1.3;
    my $vsz = sprintf("%.0f",mem_usage('vsz'));
    $Stats{"max vsz"} = max($Stats{"max vsz"}||0,$vsz) if $vsz;
    printf STDERR "max-vsz %d" ,($Stats{"max vsz"}) if ($Stats{"max vsz"}||0) >$vsz*1.3;

    print STDERR mem_usage();
    print STDERR time_estimate($PARSING_START_TIME,
                               ($Stats{"node seen"}||0)+ ($Stats{"segment seen"}||0),
                               ($Stats{"node estim"}||0)+ ($Stats{"segment estim"}||0),
                               );
    print STDERR "\r";
}
# Function is called whenever text is encountered in the XML file
#----------------------------------------------
# XML::Parser "Char" handler.
#
# Parameters:
#   $Expat  - the parser object (not used)
#   $String - the text that was read (not used)
#
# The text is ignored on purpose; nothing is stored and nothing is returned.
sub DoChar {
    my ($Expat, $String) = @_;
    return;
}
338
339##################################################################
340# Usage/manual
341
342__END__
343
344=head1 NAME
345
346B<osm2csv-segments.pl> Version 0.02
347
348=head1 DESCRIPTION
349
B<osm2csv-segments.pl> is a program to convert OSM segments from XML format to
a plain text file in CSV form.
This format is then normally used by osmtrackfilter to compare against OSM segments.
353
354=head1 SYNOPSIS
355
356B<Common usages:>
357
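osm2csv-segments.pl [-d] [-v] [-h] [--no-mirror] [--proxy=<proxy:port>] [--area=<area>[,<area>...]] [--list-areas] [--update-only] [--tie-nodes-hash] [--osm=<planet_filename.osm>] [<planet_filename.osm>]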
359
360=head1 OPTIONS
361
362=over 2
363
364=item B<--man> Complete documentation
365
366Complete documentation
367
=item B<< --proxy=<proxy:port> >>
369
370Use proxy Server to get the newest planet.osm File
371
372=item B<--no-mirror>
373
374do not try to get the newest planet.osm first
375
376=item B<--osm=filename>
377
378Source File in OSM Format
379
380=item B<--area=germany> Area Filter
381
382Only read area for processing
383
384=item B<--list-areas>
385
386print all areas possible
387
388=item B<--tie-nodes-hash>
389
If set, the Nodes hash is tied to a file.
This is at least 10 times slower, but we have fewer problems with
running out of memory.
We keep an internal list of estimated memory use and will try to
tie the hash automatically if you don't have enough memory for a
specified region.
396
397=item B<planet_filename.osm>
398
399the file to read from
400
401=back
402
403=head1 COPYRIGHT
404
405Copyright 2006, OJW
406
407This program is free software; you can redistribute it and/or
408modify it under the terms of the GNU General Public License
409as published by the Free Software Foundation; either version 2
410of the License, or (at your option) any later version.
411
412This program is distributed in the hope that it will be useful,
413but WITHOUT ANY WARRANTY; without even the implied warranty of
414MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
415GNU General Public License for more details.
416
417You should have received a copy of the GNU General Public License
418along with this program; if not, write to the Free Software
419Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
420
421=head1 AUTHOR
422
423OJW <streetmap@blibbleblobble.co.uk>
424Jörg Ostertag (osm2csv-for-openstreetmap@ostertag.name)
425
426=head1 SEE ALSO
427
428http://www.openstreetmap.org/
429
430=cut
Note: See TracBrowser for help on using the repository browser.