source: subversion/applications/utils/planet.osm/C/UTF8sanitizer.c @ 2906

Last change on this file since 2906 was 2684, checked in by jonas, 13 years ago

Added license-comment

File size: 3.1 KB
Line 
1/*
2 * UTF8sanitizer.c - filter out invalid UTF-8 byte sequences
3 *     
4 * Copyright (C) 2006 Jonas Svensson (jonass@lysator.liu.se)
5 *
6 * This program is free software; you can redistribute it and/or modify it under
7 * the terms of the GNU General Public License as published by the Free Software
8 * Foundation; either version 2 of the License, or (at your option) any later
9 * version.
10 *
11 * This program is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14 * details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
18 * Place - Suite 330, Boston, MA 02111-1307, USA.
19 * 
20 */
21
22/* To compile:
23   gcc -O2 -o UTF8sanitizer UTF8sanitizer.c
24
25   Usage (assuming bash):
26   UTF8sanitizer <sourcefile >destfile 2>errors.txt
27*/
28
29#include <stdio.h>
30
/* Longest multi-byte form recognised by the lead-byte bit patterns below. */
enum { MAX_SEQUENCE_LENGTH = 6 };

/*
 * Print the diagnostic for a malformed sequence to stderr and write the '_'
 * placeholder that stands in for the offending bytes on stdout.
 *
 * line: number of newlines read so far (so the first line is reported as 0).
 */
static void report_invalid(long long line) {
  fprintf(stderr, "Error at line %lld\n", line);
  putchar('_');
}

/*
 * Return the total length (2..6) of the multi-byte sequence introduced by
 * `byte`, judged only by its leading bit pattern (110xxxxx -> 2, 1110xxxx -> 3,
 * ... 1111110x -> 6).  Returns 0 when `byte` matches none of the lead-byte
 * patterns, which for a byte with both top bits set means 0xFE or 0xFF.
 */
static int sequence_length(int byte) {
  if ((byte & 0xE0) == 0xC0) return 2;
  if ((byte & 0xF0) == 0xE0) return 3;
  if ((byte & 0xF8) == 0xF0) return 4;
  if ((byte & 0xFC) == 0xF8) return 5;
  if ((byte & 0xFE) == 0xFC) return 6;
  return 0;
}

/*
 * Copy stdin to stdout, replacing structurally invalid UTF-8 with '_'.
 *
 * Only the lead/continuation byte structure is checked: a lead byte must be
 * followed by exactly the number of continuation bytes (10xxxxxx) that its bit
 * pattern announces.  Overlong encodings, surrogates and code point range are
 * not examined, and the 5- and 6-byte forms are passed through.
 *
 * A multi-byte sequence is buffered and written only once it is complete.
 * Each of the following produces one "Error at line N" message on stderr and
 * one '_' on stdout:
 *   - a continuation byte with no sequence in progress,
 *   - a sequence cut short by an ASCII byte, a new lead byte, an invalid byte
 *     or the end of input,
 *   - a byte that is neither ASCII, a continuation byte nor a lead byte.
 *
 * A summary is written to stderr at the end.  chars1 counts ASCII bytes other
 * than '\n'; chars2..chars6 count lead bytes seen for each length, whether or
 * not the sequence turned out to be complete.
 *
 * argc, argv: unused; the program takes no arguments.
 * Returns 0 on success, 1 if reading stdin or writing stdout failed.
 */
int main(int argc, char** argv) {
  long long line = 0;
  long long chars[MAX_SEQUENCE_LENGTH + 1] = {0}; /* indexed by length 1..6 */
  int long_char[MAX_SEQUENCE_LENGTH];             /* bytes of the pending sequence */
  int current_size = 0;                           /* announced length of pending sequence */
  int stored = 0;                                 /* bytes buffered so far; 0 = none pending */
  int current_char;
  int i;

  (void)argc;
  (void)argv;

  /* Test getchar()'s result rather than feof(): on a read error feof() stays
     false while getchar() keeps returning EOF, which would loop forever. */
  while ((current_char = getchar()) != EOF) {
    int is_continuation = (current_char & 0xC0) == 0x80;

    /* Anything other than a continuation byte ends a pending sequence early.
       Report it before the byte itself is processed so that the message
       carries the number of the line the broken sequence is on. */
    if (stored > 0 && !is_continuation) {
      report_invalid(line);
      stored = 0;
    }

    if ((current_char & 0x80) == 0) {
      /* ASCII */
      if (current_char == '\n') {
        line++;
      } else {
        chars[1]++;
      }
      putchar(current_char);
    } else if (is_continuation) {
      if (stored == 0) {
        /* Continuation byte without a preceding lead byte. */
        report_invalid(line);
      } else {
        long_char[stored++] = current_char;
        if (stored == current_size) {
          /* Sequence complete: flush it unchanged. */
          for (i = 0; i < current_size; i++) {
            putchar(long_char[i]);
          }
          stored = 0;
        }
      }
    } else {
      current_size = sequence_length(current_char);
      if (current_size == 0) {
        /* 0xFE / 0xFF: not a lead byte of any length. */
        report_invalid(line);
      } else {
        chars[current_size]++;
        long_char[0] = current_char;
        stored = 1;
      }
    }
  }

  /* Input ended in the middle of a multi-byte sequence. */
  if (stored > 0) {
    report_invalid(line);
  }

  fprintf(stderr, "Summary:\n");
  fprintf(stderr, "chars1: %lld\n", chars[1]);
  fprintf(stderr, "chars2: %lld\n", chars[2]);
  fprintf(stderr, "chars3: %lld\n", chars[3]);
  fprintf(stderr, "chars4: %lld\n", chars[4]);
  fprintf(stderr, "chars5: %lld\n", chars[5]);
  fprintf(stderr, "chars6: %lld\n", chars[6]);
  fprintf(stderr, "lines : %lld\n", line);

  /* EOF from getchar() may have been a read error, and buffered output may
     still fail when flushed; surface both through the exit status. */
  if (ferror(stdin) || fflush(stdout) == EOF || ferror(stdout)) {
    fprintf(stderr, "I/O error while filtering\n");
    return 1;
  }

  return 0;
}
Note: See TracBrowser for help on using the repository browser.