YAZ  5.31.0
charconv.tcl
Go to the documentation of this file.
1 #!/usr/bin/tclsh
2 
3 proc usage {} {
4  puts {charconv.tcl: [-p prefix] [-s split] [-o ofile] file ... }
5  exit 1
6 }
7 
8 proc preamble_trie {ofilehandle ifiles ofile} {
9  set f $ofilehandle
10 
11  set totype {unsigned }
12 
13  puts $f "/** \\file $ofile"
14  puts $f " \\brief Character conversion, generated from [lindex $ifiles 0]"
15  puts $f ""
16  puts $f " Generated automatically by charconv.tcl"
17  puts $f "*/"
18  puts $f "\#include <string.h>"
19 
20  puts $f "\#if HAVE_CONFIG_H"
21  puts $f "\#include <config.h>"
22  puts $f "\#endif"
23 
24  puts $f "
25  struct yaz_iconv_trie_flat {
26  char from\[6\];
27  unsigned combining : 1;
28  $totype to : 24;
29  };
30  struct yaz_iconv_trie_dir {
31  int ptr : 15;
32  unsigned combining : 1;
33  $totype to : 24;
34  };
35 
36  struct yaz_iconv_trie {
37  struct yaz_iconv_trie_flat *flat;
38  struct yaz_iconv_trie_dir *dir;
39  };
40  "
41  puts $f {
42  static unsigned long lookup(struct yaz_iconv_trie **ptrs, int ptr, unsigned char *inp,
43  size_t inbytesleft, size_t *no_read, int *combining, unsigned mask, unsigned int boffset)
44  {
45  struct yaz_iconv_trie *t = ptrs[ptr-1];
46  if (inbytesleft < 1)
47  return 0;
48  if (t->dir)
49  {
50  size_t ch = (inp[0] & mask) + boffset;
51  unsigned long code;
52  if (t->dir[ch].ptr)
53  {
54  code = lookup(ptrs, t->dir[ch].ptr, inp+1, inbytesleft-1, no_read, combining, mask, boffset);
55  if (code)
56  {
57  (*no_read)++;
58  return code;
59  }
60  }
61  if (t->dir[ch].to)
62  {
63  code = t->dir[ch].to;
64  *combining = t->dir[ch].combining;
65  *no_read = 1;
66  return code;
67  }
68  }
69  else
70  {
71  struct yaz_iconv_trie_flat *flat = t->flat;
72  while (flat->to)
73  {
74  size_t len = strlen(flat->from);
75  if (len <= inbytesleft)
76  {
77  size_t i;
78  for (i = 0; i < len; i++)
79  {
80  if (((unsigned char *) flat->from)[i] != (inp[i] & mask) + boffset)
81  break;
82  }
83  if (i == len)
84  {
85  *no_read = len;
86  *combining = flat->combining;
87  return flat->to;
88  }
89  }
90  flat++;
91  }
92  }
93  return 0;
94  }
95  }
96 }
97 
98 proc reset_trie {} {
99  global trie
100 
101  foreach x [array names trie] {
102  unset trie($x)
103  }
104 
105  set trie(no) 1
106  set trie(size) 0
107  set trie(max) 0
108  set trie(split) 50
109  set trie(prefix) {}
110 }
111 
112 proc ins_trie {from to combining codename} {
113  global trie
114  if {![info exists trie(no)]} {
115  set trie(no) 1
116  set trie(size) 0
117  set trie(max) 0
118  }
119  if {$trie(max) < $to} {
120  set trie(max) $to
121  }
122  incr trie(size)
123  ins_trie_r [split $from] $to $combining $codename 0
124 }
125 
126 proc split_trie {this} {
127  global trie
128  set trie($this,type) d
129  foreach e $trie($this,content) {
130  set from [lindex $e 0]
131  set to [lindex $e 1]
132  set combining [lindex $e 2]
133  set codename [lindex $e 3]
134 
135  set ch [lindex $from 0]
136  set rest [lrange $from 1 end]
137 
138  if {[llength $rest]} {
139  if {![info exist trie($this,ptr,$ch)]} {
140  set trie($this,ptr,$ch) $trie(no)
141  incr trie(no)
142  }
143  ins_trie_r $rest $to $combining $codename $trie($this,ptr,$ch)
144  } else {
145  set trie($this,to,$ch) $to
146  set trie($this,combining,$ch) $combining
147  set trie($this,codename,$ch) $codename
148  }
149  }
150  set trie($this,content) missing
151 }
152 
153 proc ins_trie_r {from to combining codename this} {
154  global trie
155 
156  if {![info exist trie($this,type)]} {
157  set trie($this,type) f
158  }
159  if {$trie($this,type) == "f"} {
160  set dup 0
161  if {[info exists trie($this,content)]} {
162  foreach e $trie($this,content) {
163  set efrom [lindex $e 0]
164  if { $efrom == $from } {
165  set dup 1
166  }
167  }
168  }
169  if { $dup == 0 } {
170  lappend trie($this,content) [list $from $to $combining $codename]
171  }
172 
173  # split ?
174  if {[llength $trie($this,content)] > $trie(split)} {
175  split_trie $this
176  return [ins_trie_r $from $to $combining $codename $this]
177  }
178  } else {
179  set ch [lindex $from 0]
180  set rest [lrange $from 1 end]
181 
182  if {[llength $rest]} {
183  if {![info exist trie($this,ptr,$ch)]} {
184  set trie($this,ptr,$ch) $trie(no)
185  incr trie(no)
186  }
187  ins_trie_r $rest $to $combining $codename $trie($this,ptr,$ch)
188  } else {
189  if {![info exist trie($this,to,$ch)]} {
190  set trie($this,to,$ch) $to
191  set trie($this,combining,$ch) $combining
192  set trie($this,codename,$ch) $codename
193  }
194  }
195  }
196 }
197 
198 proc dump_trie {ofilehandle} {
199  global trie
200 
201  set f $ofilehandle
202 
203  puts $f "/* TRIE: size $trie(size) */"
204 
205  set this $trie(no)
206  while { [incr this -1] >= 0 } {
207  puts $f "/* PAGE $this */"
208  if {$trie($this,type) == "f"} {
209  puts $f "struct yaz_iconv_trie_flat $trie(prefix)page${this}_flat\[\] = \{"
210  foreach m $trie($this,content) {
211  puts -nonewline $f " \{\""
212  foreach d [lindex $m 0] {
213  puts -nonewline $f "\\x$d"
214  }
215  puts -nonewline $f "\", [lindex $m 2], 0x[lindex $m 1]"
216  set v [lindex $m 3]
217  puts $f "\}, /* $v */"
218  }
219  puts $f " \{\"\", 0, 0\}"
220  puts $f "\};"
221  puts $f "struct yaz_iconv_trie $trie(prefix)page${this} = \{"
222  puts $f " $trie(prefix)page${this}_flat, 0"
223  puts $f "\};"
224  } else {
225  puts $f "struct yaz_iconv_trie_dir $trie(prefix)page${this}_dir\[256\] = \{"
226  for {set i 0} {$i < 256} {incr i} {
227  puts -nonewline $f " \{"
228  set ch [format %02X $i]
229  set null 1
230  if {[info exist trie($this,ptr,$ch)]} {
231  puts -nonewline $f "[expr $trie($this,ptr,$ch)+1], "
232  set null 0
233  } else {
234  puts -nonewline $f "0, "
235  }
236  if {[info exist trie($this,combining,$ch)]} {
237  puts -nonewline $f "$trie($this,combining,$ch), "
238  } else {
239  puts -nonewline $f "0, "
240  }
241  if {[info exist trie($this,to,$ch)]} {
242  puts -nonewline $f "0x$trie($this,to,$ch)\}"
243  set null 0
244  } else {
245  puts -nonewline $f "0\}"
246  }
247  if {[info exist trie($this,codename,$ch)]} {
248  set v $trie($this,codename,$ch)
249  puts -nonewline $f " /* $v */"
250  }
251  if {$i < 255} {
252  puts $f ","
253  } else {
254  puts $f ""
255  }
256  }
257  puts $f "\};"
258  puts $f "struct yaz_iconv_trie $trie(prefix)page${this} = \{"
259  puts $f " 0, $trie(prefix)page${this}_dir"
260  puts $f "\};"
261  }
262  }
263 
264  puts $f "struct yaz_iconv_trie *$trie(prefix)ptrs \[\] = {"
265  for {set this 0} {$this < $trie(no)} {incr this} {
266  puts $f " &$trie(prefix)page$this,"
267  }
268  puts $f "0, };"
269  puts $f ""
270 
271  puts $f "unsigned long yaz_$trie(prefix)_conv
272  (unsigned char *inp, size_t inbytesleft, size_t *no_read, int *combining, unsigned mask, unsigned int boffset)
273  {
274  unsigned long code;
275 
276  code = lookup($trie(prefix)ptrs, 1, inp, inbytesleft, no_read, combining, mask, boffset);
277  if (!code)
278  {
279  *no_read = 1;
280  }
281  return code;
282  }
283  "
284 }
285 
286 proc readfile {fname ofilehandle prefix omits reverse} {
287  global trie
288 
289  set marc_lines 0
290  set ucs_lines 0
291  set utf_lines 0
292  set altutf_lines 0
293  set codename_lines 0
294  set lineno 0
295  set f [open $fname r]
296  set tablenumber x
297  set combining 0
298  set codename {}
299  set altutf {}
300  while {1} {
301  incr lineno
302  set cnt [gets $f line]
303  if {$cnt < 0} {
304  break
305  }
306  if {[regexp {</characterSet>} $line s]} {
307  dump_trie $ofilehandle
308  } elseif {[regexp {<characterSet .*ISOcode="([0-9A-Fa-f]+)"} $line s tablenumber]} {
309  reset_trie
310  set trie(prefix) "${prefix}_$tablenumber"
311  set combining 0
312  } elseif {[regexp {</code>} $line s]} {
313  if {[string length $ucs]} {
314  if {$reverse} {
315  for {set i 0} {$i < [string length $utf]} {incr i 2} {
316  lappend hex [string range $utf $i [expr $i+1]]
317  }
318  # puts "ins_trie $hex $marc
319  ins_trie $hex $marc $combining $codename
320  unset hex
321 
322  } else {
323  for {set i 0} {$i < [string length $marc]} {incr i 2} {
324  lappend hex [string range $marc $i [expr $i+1]]
325  }
326  # puts "ins_trie $hex $ucs"
327  ins_trie $hex $ucs $combining $codename
328  unset hex
329  }
330  }
331  if {$reverse && [string length $marc]} {
332  for {set i 0} {$i < [string length $altutf]} {incr i 2} {
333  lappend hex [string range $altutf $i [expr $i+1]]
334  }
335  if {[info exists hex]} {
336  ins_trie $hex $marc $combining $codename
337  unset hex
338  }
339  }
340  set marc {}
341  set uni {}
342  set codename {}
343  set combining 0
344  set altutf {}
345  } elseif {[regexp {<marc>([0-9A-Fa-f]*)</marc>} $line s marc]} {
346  incr marc_lines
347  } elseif {[regexp {<name>(.*)</name>} $line s codename]} {
348  incr codename_lines
349  } elseif {[regexp {<name>(.*)} $line s codename]} {
350  incr codename_lines
351  incr lineno
352  set cnt [gets $f line]
353  if {$cnt < 0} {
354  break
355  }
356  if {[regexp {(.*)</name>} $line s codename_ex]} {
357  set codename "${codename} ${codename_ex}"
358  }
359  } elseif {[regexp {<isCombining>true</isCombining>} $line s]} {
360  set combining 1
361  } elseif {[regexp {<ucs>([0-9A-Fa-f]*)</ucs>} $line s ucs]} {
362  incr ucs_lines
363  } elseif {[regexp {<utf-8>([0-9A-Fa-f]*)</utf-8>} $line s utf]} {
364  incr utf_lines
365  } elseif {[regexp {<altutf-8>([0-9A-Fa-f]*)</altutf-8>} $line s altutf]} {
366  incr altutf_lines
367  }
368  }
369  close $f
370 }
371 
372 set verbose 0
373 set ifile {}
374 set ofile out.c
375 set prefix {c}
376 set reverse_map 0
377 # Parse command line
378 set l [llength $argv]
379 set i 0
380 set omits {}
381 while {$i < $l} {
382  set arg [lindex $argv $i]
383  switch -glob -- $arg {
384  -v {
385  incr verbose
386  }
387  -s {
388  if {[string length $arg]} {
389  set arg [lindex $argv [incr i]]
390  }
391  set trie(split) $arg
392  }
393  -p {
394  if {[string length $arg]} {
395  set arg [lindex $argv [incr i]]
396  }
397  set prefix $arg
398  }
399  -o {
400  if {[string length $arg]} {
401  set arg [lindex $argv [incr i]]
402  }
403  set ofile $arg
404  }
405  -O {
406  if {[string length $arg]} {
407  set arg [lindex $argv [incr i]]
408  }
409  lappend omits $arg
410  }
411  -r {
412  set reverse_map 1
413  }
414  default {
415  lappend ifiles $arg
416  }
417  }
418  incr i
419 }
420 if {![info exists ifiles]} {
421  puts "charconv.tcl: missing input file(s)"
422  usage
423 }
424 
425 set ofilehandle [open ${ofile}.tmp w]
426 preamble_trie $ofilehandle $ifiles $ofile
427 
428 foreach ifile $ifiles {
429  readfile $ifile $ofilehandle $prefix $omits $reverse_map
430 }
431 close $ofilehandle
432 
433 file rename -force ${ofile}.tmp ${ofile}
434 
435