triehash.pl 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661
  1. #!/usr/bin/perl -w
  2. #
  3. # Copyright (C) 2016 Julian Andres Klode <jak@jak-linux.org>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  21. # THE SOFTWARE.
  22. =head1 NAME
  23. triehash - Generate a perfect hash function derived from a trie.
  24. =cut
  25. use strict;
  26. use warnings;
  27. use Getopt::Long;
  28. =head1 SYNOPSIS
  29. B<triehash> [S<I<option>>] [S<I<input file>>]
  30. =head1 DESCRIPTION
  31. triehash takes a list of words in input file and generates a function and
  32. an enumeration to describe the word
  33. =head1 INPUT FILE FORMAT
  34. The file consists of multiple lines of the form:
  35. [label ~ ] word [= value]
  36. This maps word to value, and generates an enumeration with entries of the form:
  37. label = value
  38. If I<label> is undefined, the word will be used, the minus character will be
  39. replaced by an underscore. If value is undefined it is counted upwards from
  40. the last value.
  41. There may also be one line of the format
  42. [ label ~] = value
  43. Which defines the value to be used for non-existing keys. Note that this also
  44. changes default value for other keys, as for normal entries. So if you place
  45. = 0
  46. at the beginning of the file, unknown strings map to 0, and the other strings
  47. map to values starting with 1. If label is not specified, the default is
  48. I<Unknown>.
  49. =head1 OPTIONS
  50. =over 4
  51. =item B<-C>I<.c file> B<--code>=I<.c file>
  52. Generate code in the given file.
  53. =item B<-H>I<header file> B<--header>=I<header file>
  54. Generate a header in the given file, containing a declaration of the hash
  55. function and an enumeration.
  56. =item B<--enum-name=>I<word>
  57. The name of the enumeration.
  58. =item B<--function-name=>I<word>
  59. The name of the function.
  60. =item B<--namespace=>I<name>
  61. Put the function and enum into a namespace (C++)
  62. =item B<--class=>I<name>
  63. Put the function and enum into a class (C++)
  64. =item B<--enum-class>
  65. Generate an enum class instead of an enum (C++)
  66. =item B<--counter-name=>I<name>
  67. Use I<name> for a counter that is set to the latest entry in the enumeration
  68. + 1. This can be useful for defining array sizes.
  69. =item B<--extern-c>
  70. Wrap everything into an extern "C" block. Not compatible with the C++
  71. options, as a header with namespaces, classes, or enum classes is not
  72. valid C.
  73. =item B<--multi-byte>=I<value>
  74. Generate code reading multiple bytes at once. The value is a string of power
  75. of twos to enable. The default value is 320 meaning that 8, 4, and single byte
  76. reads are enabled. Specify 0 to disable multi-byte completely, or add 2 if you
  77. also want to allow 2-byte reads. 2-byte reads are disabled by default because
  78. they negatively affect performance on older Intel architectures.
  79. This generates code for both multiple bytes and single byte reads, but only
  80. enables the multiple byte reads of GNU C compatible compilers, as the following
  81. extensions are used:
  82. =over 8
  83. =item Byte-aligned integers
  84. We must be able to generate integers that are aligned to a single byte using:
  85. typedef uint64_t __attribute__((aligned (1))) triehash_uu64;
  86. =item Byte-order
  87. The macros __BYTE_ORDER__ and __ORDER_LITTLE_ENDIAN__ must be defined.
  88. =back
  89. We forcefully disable multi-byte reads on platforms where the variable
  90. I<__ARM_ARCH> is defined and I<__ARM_FEATURE_UNALIGNED> is not defined,
  91. as there is a measurable overhead from emulating the unaligned reads on
  92. ARM.
  93. =item B<--language=>I<language>
  94. Generate a file in the specified language. Currently known are 'C' and 'tree',
  95. the latter generating a tree.
  96. =item B<--include=>I<header>
  97. Add the header to the include statements of the header file. The value must
  98. be surrounded by quotes or angle brackets for C code. May be specified multiple
  99. times.
  100. =back
  101. =cut
  102. my $unknown = -1;
  103. my $unknown_label = "Unknown";
  104. my $counter_start = 0;
  105. my $enum_name = "PerfectKey";
  106. my $function_name = "PerfectHash";
  107. my $enum_class = 0;
  108. my $code_name = "-";
  109. my $header_name = "-";
  110. my $code;
  111. my $header;
  112. my $ignore_case = 0;
  113. my $multi_byte = "320";
  114. my $language = 'C';
  115. my $counter_name = undef;
  116. my @includes = ();
  117. Getopt::Long::config('default',
  118. 'bundling',
  119. 'no_getopt_compat',
  120. 'no_auto_abbrev',
  121. 'permute',
  122. 'auto_help');
  123. GetOptions ("code|C=s" => \$code_name,
  124. "header|H=s" => \$header_name,
  125. "function-name=s" => \$function_name,
  126. "ignore-case" => \$ignore_case,
  127. "enum-name=s" => \$enum_name,
  128. "language|l=s" => \$language,
  129. "multi-byte=s" => \$multi_byte,
  130. "enum-class" => \$enum_class,
  131. "include=s" => \@includes,
  132. "counter-name=s" => \$counter_name)
  133. or die("Could not parse options!");
  134. package Trie {
  135. sub new {
  136. my $class = shift;
  137. my $self = {};
  138. bless $self, $class;
  139. $self->{children} = {};
  140. $self->{value} = undef;
  141. $self->{label} = undef;
  142. return $self;
  143. }
  144. # Return the largest power of 2 smaller or equal to the argument
  145. sub alignpower2 {
  146. my ($self, $length) = @_;
  147. return 8 if ($length >= 8 && $multi_byte =~ /3/);
  148. return 4 if ($length >= 4 && $multi_byte =~ /2/);
  149. return 2 if ($length >= 2 && $multi_byte =~ /1/);
  150. return 1;
  151. }
  152. # Split the key into a head block and a tail
  153. sub split_key {
  154. my ($self, $key) = @_;
  155. my $length = length $key;
  156. my $split = $self->alignpower2($length);
  157. return (substr($key, 0, $split), substr($key, $split));
  158. }
  159. sub insert {
  160. my ($self, $key, $label, $value) = @_;
  161. if (length($key) == 0) {
  162. $self->{label} = $label;
  163. $self->{value} = $value;
  164. return;
  165. }
  166. my ($child, $tail) = $self->split_key($key);
  167. $self->{children}{$child} = Trie->new if (!defined($self->{children}{$child}));
  168. $self->{children}{$child}->insert($tail, $label, $value);
  169. }
  170. sub filter_depth {
  171. my ($self, $togo) = @_;
  172. my $new = Trie->new;
  173. if ($togo != 0) {
  174. my $found = 0;
  175. foreach my $key (sort keys %{$self->{children}}) {
  176. if ($togo > length($key) || defined $self->{children}{$key}->{value}) {
  177. my $child = $self->{children}{$key}->filter_depth($togo - length($key));
  178. $new->{children}{$key}= $child if defined $child;
  179. $found = 1 if defined $child;
  180. }
  181. }
  182. return undef if (!$found);
  183. } else {
  184. $new->{value} = $self->{value};
  185. $new->{label} = $self->{label};
  186. }
  187. return $new;
  188. }
  189. # Reinsert all value nodes into the specified $trie, prepending $prefix
  190. # to their $paths.
  191. sub reinsert_value_nodes_into {
  192. my ($self, $trie, $prefix) = @_;
  193. $trie->insert($prefix, $self->{label}, $self->{value}) if (defined $self->{value});
  194. foreach my $key (sort keys %{$self->{children}}) {
  195. $self->{children}{$key}->reinsert_value_nodes_into($trie, $prefix . $key);
  196. }
  197. }
  198. # Find an earlier split due a an ambiguous character
  199. sub find_ealier_split {
  200. my ($self, $key) = @_;
  201. if ($ignore_case) {
  202. for my $i (0..length($key)-1) {
  203. # If the key starts with an ambiguous character, we need to
  204. # take only it. Otherwise, we need to take everything
  205. # before the character.
  206. return $self->alignpower2($i || 1) if (main::ambiguous(substr($key, $i, 1)));
  207. }
  208. }
  209. return $self->alignpower2(length $key);
  210. }
  211. # Rebuild the trie, splitting at ambigous chars, and unifying key lengths
  212. sub rebuild_tree {
  213. my $self = shift;
  214. # Determine if/where we need to split before an ambiguous character
  215. my $new_split = 99999999999999999;
  216. foreach my $key (sort keys %{$self->{children}}) {
  217. my $special_length = $self->find_ealier_split($key);
  218. $new_split = $special_length if ($special_length < $new_split);
  219. }
  220. # Start building a new uniform trie
  221. my $newself = Trie->new;
  222. $newself->{label} = $self->{label};
  223. $newself->{value} = $self->{value};
  224. $newself->{children} = {};
  225. foreach my $key (sort keys %{$self->{children}}) {
  226. my $head = substr($key, 0, $new_split);
  227. my $tail = substr($key, $new_split);
  228. # Rebuild the child node at $head, pushing $tail downwards
  229. $newself->{children}{$head} //= Trie->new;
  230. $self->{children}{$key}->reinsert_value_nodes_into($newself->{children}{$head}, $tail);
  231. # We took up to one special character of each key label. There might
  232. # be more, so we need to rebuild recursively.
  233. $newself->{children}{$head} = $newself->{children}{$head}->rebuild_tree();
  234. }
  235. return $newself;
  236. }
  237. }
  238. # Code generator for C and C++
  239. package CCodeGen {
  240. my $static = ($code_name eq $header_name) ? "static" : "";
  241. my $enum_specifier = $enum_class ? "enum class" : "enum";
  242. sub new {
  243. my $class = shift;
  244. my $self = {};
  245. bless $self, $class;
  246. return $self;
  247. }
  248. sub open_output {
  249. my $self = shift;
  250. if ($code_name ne "-") {
  251. open($code, '>', $code_name) or die "Cannot open $code_name: $!" ;
  252. } else {
  253. $code = *STDOUT;
  254. }
  255. if($code_name eq $header_name) {
  256. $header = $code;
  257. } elsif ($header_name ne "-") {
  258. open($header, '>', $header_name) or die "Cannot open $header_name: $!" ;
  259. } else {
  260. $header = *STDOUT;
  261. }
  262. }
  263. sub word_to_label {
  264. my ($class, $word) = @_;
  265. $word =~ s/_/__/g;
  266. $word =~ s/-/_/g;
  267. return $word;
  268. }
  269. # Return a case label, by shifting and or-ing bytes in the word
  270. sub case_label {
  271. my ($self, $key) = @_;
  272. return sprintf("'%s'", substr($key, 0, 1)) if not $multi_byte;
  273. my $output = '0';
  274. for my $i (0..length($key)-1) {
  275. $output .= sprintf("| onechar('%s', %d, %d)", substr($key, $i, 1), 8 * $i, 8*length($key));
  276. }
  277. return $output;
  278. }
  279. # Return an appropriate read instruction for $length bytes from $offset
  280. sub switch_key {
  281. my ($self, $offset, $length) = @_;
  282. return "string[$offset]" if $length == 1;
  283. return sprintf("*((triehash_uu%s*) &string[$offset])", $length * 8);
  284. }
  285. sub print_table {
  286. my ($self, $trie, $fh, $indent, $index) = @_;
  287. $indent //= 0;
  288. $index //= 0;
  289. if (defined $trie->{value}) {
  290. printf $fh (" " x $indent . "return %s;\n", ($enum_class ? "${enum_name}::" : "").$trie->{label});
  291. return;
  292. }
  293. # The difference between lowercase and uppercase alphabetical characters
  294. # is that they have one bit flipped. If we have alphabetical characters
  295. # in the search space, and the entire search space works fine if we
  296. # always turn on the flip, just OR the character we are switching over
  297. # with the bit.
  298. my $want_use_bit = 0;
  299. my $can_use_bit = 1;
  300. my $key_length = 0;
  301. foreach my $key (sort keys %{$trie->{children}}) {
  302. $can_use_bit &= not main::ambiguous($key);
  303. $want_use_bit |= ($key =~ /^[a-zA-Z]+$/);
  304. $key_length = length($key);
  305. }
  306. if ($ignore_case && $can_use_bit && $want_use_bit) {
  307. printf $fh ((" " x $indent) . "switch(%s | 0x%s) {\n", $self->switch_key($index, $key_length), "20" x $key_length);
  308. } else {
  309. printf $fh ((" " x $indent) . "switch(%s) {\n", $self->switch_key($index, $key_length));
  310. }
  311. my $notfirst = 0;
  312. foreach my $key (sort keys %{$trie->{children}}) {
  313. if ($notfirst) {
  314. printf $fh (" " x $indent . " break;\n");
  315. }
  316. if ($ignore_case) {
  317. printf $fh (" " x $indent . "case %s:\n", $self->case_label(lc($key)));
  318. printf $fh (" " x $indent . "case %s:\n", $self->case_label(uc($key))) if lc($key) ne uc($key) && !($can_use_bit && $want_use_bit);
  319. } else {
  320. printf $fh (" " x $indent . "case %s:\n", $self->case_label($key));
  321. }
  322. $self->print_table($trie->{children}{$key}, $fh, $indent + 1, $index + length($key));
  323. $notfirst=1;
  324. }
  325. printf $fh (" " x $indent . "}\n");
  326. }
  327. sub print_words {
  328. my ($self, $trie, $fh, $indent, $sofar) = @_;
  329. $indent //= 0;
  330. $sofar //= "";
  331. printf $fh (" " x $indent."%s = %s,\n", $trie->{label}, $trie->{value}) if defined $trie->{value};
  332. foreach my $key (sort keys %{$trie->{children}}) {
  333. $self->print_words($trie->{children}{$key}, $fh, $indent, $sofar . $key);
  334. }
  335. }
  336. sub print_functions {
  337. my ($self, $trie, %lengths) = @_;
  338. foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
  339. print $code ("static enum ${enum_name} ${function_name}${local_length}(const char *string)\n");
  340. print $code ("{\n");
  341. $self->print_table($trie->filter_depth($local_length)->rebuild_tree(), $code, 1);
  342. printf $code (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : ""));
  343. print $code ("}\n");
  344. }
  345. }
  346. sub main {
  347. my ($self, $trie, $num_values, %lengths) = @_;
  348. print $header ("#ifndef TRIE_HASH_${function_name}\n");
  349. print $header ("#define TRIE_HASH_${function_name}\n");
  350. print $header ("#include <stddef.h>\n");
  351. print $header ("#include <stdint.h>\n");
  352. foreach my $include (@includes) {
  353. print $header ("#include $include\n");
  354. }
  355. printf $header ("enum { $counter_name = $num_values };\n") if (defined($counter_name));
  356. print $header ("${enum_specifier} ${enum_name} {\n");
  357. $self->print_words($trie, $header, 1);
  358. printf $header (" $unknown_label = $unknown,\n");
  359. print $header ("};\n");
  360. print $header ("$static enum ${enum_name} ${function_name}(const char *string, size_t length);\n");
  361. print $code ("#include \"$header_name\"\n") if ($header_name ne $code_name);
  362. if ($multi_byte) {
  363. print $code ("#ifdef __GNUC__\n");
  364. for (my $i=16; $i <= 64; $i *= 2) {
  365. print $code ("typedef uint${i}_t __attribute__((aligned (1))) triehash_uu${i};\n");
  366. print $code ("typedef char static_assert${i}[__alignof__(triehash_uu${i}) == 1 ? 1 : -1];\n");
  367. }
  368. print $code ("#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\n");
  369. print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (s))\n");
  370. print $code ("#else\n");
  371. print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))\n");
  372. print $code ("#endif\n");
  373. print $code ("#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)\n");
  374. print $code ("#define TRIE_HASH_MULTI_BYTE\n");
  375. print $code ("#endif\n");
  376. print $code ("#endif /*GNUC */\n");
  377. print $code ("#ifdef TRIE_HASH_MULTI_BYTE\n");
  378. $self->print_functions($trie, %lengths);
  379. $multi_byte = 0;
  380. print $code ("#else\n");
  381. $self->print_functions($trie, %lengths);
  382. print $code ("#endif /* TRIE_HASH_MULTI_BYTE */\n");
  383. } else {
  384. $self->print_functions($trie, %lengths);
  385. }
  386. print $code ("$static enum ${enum_name} ${function_name}(const char *string, size_t length)\n");
  387. print $code ("{\n");
  388. print $code (" switch (length) {\n");
  389. foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
  390. print $code (" case $local_length:\n");
  391. print $code (" return ${function_name}${local_length}(string);\n");
  392. }
  393. print $code (" default:\n");
  394. printf $code (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : ""));
  395. print $code (" }\n");
  396. print $code ("}\n");
  397. # Print end of header here, in case header and code point to the same file
  398. print $header ("#endif /* TRIE_HASH_${function_name} */\n");
  399. }
  400. }
  401. # Check if the word can be reached by exactly one word in (alphabet OR 0x20).
  402. sub ambiguous {
  403. my $word = shift;
  404. foreach my $char (split //, $word) {
  405. # Setting the lowercase flag in the character produces a different
  406. # character, the character would thus not be matched.
  407. return 1 if ((ord($char) | 0x20) != ord(lc($char)));
  408. # A word is also ambiguous if any character in lowercase can be reached
  409. # by ORing 0x20 from another character in the charset that is not a
  410. # lowercase character of the current character.
  411. # Assume that we have UTF-8 and the most significant bit can be set
  412. for my $i (0..255) {
  413. return 1 if (($i | 0x20) == ord(lc($char)) && lc(chr($i)) ne lc($char));
  414. }
  415. }
  416. return 0;
  417. }
  418. sub build_trie {
  419. my $codegen = shift;
  420. my $trie = Trie->new;
  421. my $counter = $counter_start;
  422. my %lengths;
  423. open(my $input, '<', $ARGV[0]) or die "Cannot open ".$ARGV[0].": $!";
  424. while (my $line = <$input>) {
  425. my ($label, $word, $value) = $line =~/\s*(?:([^~\s]+)\s*~)?(?:\s*([^~=\s]+)\s*)?(?:=\s*([^\s]+)\s+)?\s*/;
  426. if (defined $word) {
  427. $counter = $value if defined($value);
  428. $label //= $codegen->word_to_label($word);
  429. $trie->insert($word, $label, $counter);
  430. $lengths{length($word)} = 1;
  431. $counter++;
  432. } elsif (defined $value) {
  433. $unknown = $value;
  434. $unknown_label = $label if defined($label);
  435. $counter = $value + 1;
  436. } else {
  437. die "Invalid line: $line";
  438. }
  439. }
  440. return ($trie, $counter, %lengths);
  441. }
  442. # Generates an ASCII art tree
  443. package TreeCodeGen {
  444. sub new {
  445. my $class = shift;
  446. my $self = {};
  447. bless $self, $class;
  448. return $self;
  449. }
  450. sub word_to_label {
  451. my ($self, $word) = @_;
  452. return $word;
  453. }
  454. sub main {
  455. my ($self, $trie, $counter, %lengths) = @_;
  456. printf $code ("┌────────────────────────────────────────────────────┐\n");
  457. printf $code ("│ Initial trie │\n");
  458. printf $code ("└────────────────────────────────────────────────────┘\n");
  459. $self->print($trie);
  460. printf $code ("┌────────────────────────────────────────────────────┐\n");
  461. printf $code ("│ Rebuilt trie │\n");
  462. printf $code ("└────────────────────────────────────────────────────┘\n");
  463. $self->print($trie->rebuild_tree());
  464. foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
  465. printf $code ("┌────────────────────────────────────────────────────┐\n");
  466. printf $code ("│ Trie for words of length %-4d │\n", $local_length);
  467. printf $code ("└────────────────────────────────────────────────────┘\n");
  468. $self->print($trie->filter_depth($local_length)->rebuild_tree());
  469. }
  470. }
  471. sub open_output {
  472. my $self = shift;
  473. if ($code_name ne "-") {
  474. open($code, '>', $code_name) or die "Cannot open ".$ARGV[0].": $!" ;
  475. } else {
  476. $code = *STDOUT;
  477. }
  478. }
  479. # Print a trie
  480. sub print {
  481. my ($self, $trie, $depth) = @_;
  482. $depth //= 0;
  483. print $code (" → ") if defined($trie->{label});
  484. print $code ($trie->{label} // "", "\n");
  485. foreach my $key (sort keys %{$trie->{children}}) {
  486. print $code ("│ " x ($depth), "├── $key");
  487. $self->print($trie->{children}{$key}, $depth + 1);
  488. }
  489. }
  490. }
  491. my %codegens = (
  492. C => "CCodeGen",
  493. tree => "TreeCodeGen",
  494. );
  495. defined($codegens{$language}) or die "Unknown language $language. Valid choices: ", join(", ", keys %codegens);
  496. my $codegen = $codegens{$language}->new();
  497. my ($trie, $counter, %lengths) = build_trie($codegen);
  498. $codegen->open_output();
  499. $codegen->main($trie, $counter, %lengths);
  500. =head1 LICENSE
  501. triehash is available under the MIT/Expat license, see the source code
  502. for more information.
  503. =head1 AUTHOR
  504. Julian Andres Klode <jak@jak-linux.org>
  505. =cut