source: subversion/applications/utils/planet.osm/python/utf8osmchecker.py @ 20342

Last change on this file since 20342 was 1846, checked in by joerg, 13 years ago

set executable property

  • Property svn:executable set to *
File size: 4.2 KB
Line 
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4# Copyright (C) 2006  Michael Strecke
5#
6# This program is free software; you can redistribute it and/or
7# modify it under the terms of the GNU General Public License
8# as published by the Free Software Foundation; either version 2
9# of the License, or (at your option) any later version.
10#
11# This program is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14# GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program; if not, write to the Free Software
18# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
19#
20#
21# Version 0.1, 2006-08-15
22#
23# Quick and dirty scanner for non UTF-8 text in planet.osm
24#
25# The parser checks the XML for closing elements for node/segment/way elements.
26# If one is found output_buffer2 is called.
27# buffer2 stores the input since the last call of output_buffer2
28# If in the meantime a character does not meet the UTF-8 condition, it sets markedforoutput.
29# If this flag is set, output_buffer2 prints the content of buffer2 to the screen and
30# saves it in an error file.
31# The flag is reset and buffer2 cleared.
32
33import re, sys
34
# Input file that is scanned and output file that collects the offending chunks
xmlfilename = "planet.osm"
errorfile = "notutf.bin"

# Scanner state shared by the functions of this script
last = lasterror = ""      # ID of last node/segment/way / ID of last offending one
buffer2 = ""               # input collected since the last call of output_buffer2
markedforoutput = False    # set when buffer2 contains a non UTF-8 byte
counter = 0                # number of offending elements
43
def error():
   """Record that the byte just examined violates the UTF-8 conditions.

   Marks the pending chunk (buffer2) for output and counts it once.
   The "already counted" test is the markedforoutput flag itself, which
   output_buffer2 clears on every flush.  Comparing lasterror with last is
   not sufficient for that purpose: both start out as "" (so an error
   before the first element would be ignored), and a bad byte inside a
   start tag is seen before procinside has stored the new ID in last (so
   two consecutive broken self-closing elements would look like one).

   lasterror is still updated so that it keeps naming the element that
   was current when the most recent error was seen.
   """
   global last, lasterror, counter, markedforoutput

   if not markedforoutput:   # only count one error per flushed chunk
      markedforoutput = True
      counter += 1
   lasterror = last
51   
def output_buffer2():
   """Flush the chunk collected in buffer2.

   If markedforoutput is set, the chunk is printed to the screen and
   written to the already opened error file errout.  In every case the
   flag is reset and buffer2 is emptied afterwards.
   """
   global buffer2, markedforoutput, errout
   if markedforoutput:
      # Parenthesised single-argument form: same output as the Python 2
      # print statement, and still valid syntax on Python 3.
      print(buffer2)
      errout.write(buffer2)
   markedforoutput = False
   buffer2 = ""
59
def procinside(buf):
   """Handle the text found between '<' and '>'.

   buf -- tag content without the surrounding angle brackets

   The tag is appended (with its brackets restored) to buffer2.  A start
   tag of a node/segment/way stores "<name> <id>" in last; the pending
   chunk is flushed through output_buffer2 when such an element is self
   closing or when its end tag is seen.
   """
   global last, ma, ema, buffer2

   buffer2 = buffer2 + "<" + buf + ">"
   tracked = ('node', 'segment', 'way')

   start = ma.match(buf)                      # starting tag?
   if not start:
      end = ema.match(buf)                    # end tag?
      if end and end.group(1) in tracked:
         output_buffer2()
      return

   name, ident = start.group(1), start.group(2)
   if name not in tracked:
      return

   last = "%s %s" % (name, ident)             # "node 1234", "segment 6543", ...
   if buf.endswith("/"):                      # self closing element
      output_buffer2()
79   
def procoutside(buf):
   """Append the text found outside of a tag to the pending chunk buffer2.

   buf -- text collected since the previous '>' (may be empty)
   """
   global buffer2
   buffer2 = buffer2 + buf
83
class bufferedread(object):
   """Read a file opened in binary mode one byte at a time.

   The file is fetched in blocks of buffsize bytes; readchar hands out
   one byte of the current block per call and refills the block when it
   is used up.
   """

   def __init__(self, fnm):
      """Open the file named fnm for binary reading; nothing is read yet."""
      self.buffer = None       # current block, filled by the first readchar
      self.buffsize = 10000    # number of bytes requested per read
      self.fi = open(fnm, "rb")
      self.curpos = 0          # index of the next byte to return
      self.maxpos = 0          # length of the current block

   def readchar(self):
      """Return the next byte as a string of length 1, or None at EOF."""
      if self.curpos == self.maxpos:
         self.buffer = self.fi.read(self.buffsize)
         self.maxpos = len(self.buffer)
         self.curpos = 0
         # Truthiness instead of == '': a binary read yields a bytes
         # object, which never compares equal to '' on Python 3.
         if not self.buffer:   # EOF
            return None

      pos = self.curpos
      self.curpos = pos + 1
      # A slice is a length-1 string on every Python version; plain
      # indexing would give an integer for a bytes object on Python 3.
      return self.buffer[pos:pos + 1]

   def close(self):
      """Close the underlying file; calling it again is harmless."""
      if self.fi is not None:
         self.fi.close()
         self.fi = None
106     
# uniform pattern in planet.osm
# XYZ id='xyz'
# Raw strings: "\s" in a plain literal is an invalid escape sequence that
# newer Python versions warn about; the pattern text itself is unchanged.
ma = re.compile(r"^(.+?)\sid='(.+?)'")   # start tag: group 1 = name, group 2 = id
ema = re.compile(r"^/(.+?)$")            # end tag: group 1 = name
111
def utf8_step(mode, c):
   """Advance the UTF-8 check by one byte.

   mode -- number of continuation bytes still expected
           (0 when a new character starts)
   c    -- value of the current byte as an integer (0..255)

   Returns the tuple (newmode, bad): newmode is the number of
   continuation bytes expected after this byte, bad is True when the
   byte violates the UTF-8 conditions.

   Lead bytes follow RFC 3629: 0xC2-0xDF announce one continuation byte,
   0xE0-0xEF two and 0xF0-0xF4 three.  0xC0, 0xC1 and 0xF5-0xFF never
   occur in UTF-8.  Only the byte classes are checked; overlong forms and
   surrogates that depend on the value of the second byte are not.
   """
   bad = False
   if mode > 0:
      if 0x80 <= c <= 0xbf:
         return mode - 1, False
      # The sequence was cut short.  Report it and classify this byte as
      # the start of a new character instead of swallowing it.
      bad = True

   if c <= 0x7f:
      return 0, bad             # ascii
   if 0xc2 <= c <= 0xdf:
      return 1, bad             # one octet follows
   if 0xe0 <= c <= 0xef:
      return 2, bad             # two octets follow
   if 0xf0 <= c <= 0xf4:
      return 3, bad             # three octets follow
   return 0, True               # stray continuation byte or invalid lead


def main():
   """Scan xmlfilename byte by byte and report chunks that are not UTF-8.

   Every byte is checked with utf8_step and, independently, fed to a
   minimal tag splitter: text between '<' and '>' goes to procinside,
   everything else to procoutside.  Offending chunks are written to
   errorfile by output_buffer2; the number of errors is printed at the
   end.
   """
   global errout                # output_buffer2 writes to this global

   f = bufferedread(xmlfilename)
   try:
      errout = open(errorfile, "wb")
      try:
         buf = ""
         inbrac = False         # True while between '<' and '>'
         mode = 0               # continuation bytes still expected
         x = f.readchar()
         while x is not None:
            mode, bad = utf8_step(mode, ord(x))
            if bad:
               error()

            if inbrac:
               if x == '>':
                  procinside(buf)
                  buf = ""
                  inbrac = False
               else:
                  buf += x
            else:
               if x == '<':
                  procoutside(buf)
                  buf = ""
                  inbrac = True
               else:
                  buf += x
            x = f.readchar()

         if mode != 0:
            error()             # file ends inside a multi-byte sequence

         # Whatever follows the last node/segment/way has not been flushed
         # yet; without this a counted error could be missing from the
         # error file.  An unterminated tag is kept as raw text.
         if inbrac:
            procoutside("<" + buf)
         else:
            procoutside(buf)
         output_buffer2()
      finally:
         errout.close()
   finally:
      f.close()

   print("%d errors found" % counter)


if __name__ == "__main__":
   main()
Note: See TracBrowser for help on using the repository browser.