#!/usr/bin/env perl

# jamo-normalize.pl
#
# Copyright (c) 2003-2015 Dohyun Kim
#
# This work may be distributed and/or modified under the
# conditions of the LaTeX Project Public License, either version 1.3c
# of this license or (at your option) any later version.
# The latest version of this license is in
#   http://www.latex-project.org/lppl.txt
# and version 1.3c or later is part of all distributions of LaTeX
# version 2006/05/20 or later.
#
# written by Dohyun Kim
# public domain
#

#use strict;
#use warnings;

##### command line options #####

# Every argument is consumed here, so the later `while (<>)` always reads
# STDIN.  The option patterns are unanchored and case-insensitive: any
# argument that merely contains e.g. "-b" or "-B" switches that option on.
# An argument matching none of the patterns prints the usage text to STDOUT
# and exits.
my %OPT;
while (@ARGV) {
    my $opt = shift @ARGV;
    if    ($opt =~ /-b/i) { $OPT{boundary}    = 1 }
    elsif ($opt =~ /-p/i) { $OPT{topua}       = 1 }
    elsif ($opt =~ /-d/i) { $OPT{decompose}   = 1 }
    elsif ($opt =~ /-o/i) { $OPT{frompua}     = 1 }
    elsif ($opt =~ /-t/i) { $OPT{latintm}     = 1 }
    elsif ($opt =~ /-r/i) { $OPT{reordertm}   = 1 }
    elsif ($opt =~ /-i/i) { $OPT{normalhanja} = 1 }
    elsif ($opt =~ /-c/i) { $OPT{compatjamo}  = 1 }
    else {
        # Strip the directory part (either slash style) from the program name.
        (my $prog = $0) =~ s/.*[\/\\]//;
        print "Usage: $prog [options] < in_file > out_file\n\n",
            " Translate Hangul Jamo sequence to Hangul syllables\n\n",
            " -b : insert ZWS between syllable blocks (not for practical use)\n",
            " -c : convert conjoining Jamo to compatibility Jamo\n",
            " -d : decomposition only, and no further recomposition\n",
            " -i : convert compatibility Hanja to normal Hanja\n",
            " -o : decompose PUA Old Hangul syllables to Jamo sequence\n",
            " -p : compose Jamo sequence to PUA Old Hangul syllables\n",
            " -r : reorder Hangul Tone Marks to the first of syllable block\n",
            " (not for practical use)\n",
            " -t : convert U+00B7 or U+003A to Hangul Tone Marks\n";
        exit;
    }
}

##### variables #####

# Character-class fragments (meant to be interpolated inside [...]):
# leading consonants, vowels, trailing consonants and tone marks.
my $cho  = "\x{1100}-\x{115F}\x{A960}-\x{A97C}";
my $jung = "\x{1160}-\x{11A7}\x{D7B0}-\x{D7C6}";
my $jong = "\x{11A8}-\x{11FF}\x{D7CB}-\x{D7FB}";
my $tmrk = "\x{302E}\x{302F}";

# Separator inserted by insert_boundary (U+200B, the "ZWS" of option -b).
my $boundary = "\x{200B}";

# One syllable block: cho + jung, optional jong, optional tone mark.
my $syllblock = "[$cho][$jung][$jong]?[$tmrk]?";

# Any single conjoining jamo or tone mark.
my $jamos = "[$cho$jung$jong$tmrk]";

# Conjoining jamo code point => compatibility jamo code point.
# The trailing-consonant rows and the 0x1145/0x1146 row are disabled.
# NOTE(review): the row layout of this table was rebuilt from a copy whose
# line breaks had been lost; which rows are commented out follows the $jong
# ranges above -- confirm against the upstream file.
my %jamo2cjamo = (
    0x1100 => 0x3131, 0x1101 => 0x3132, 0x1102 => 0x3134, 0x1103 => 0x3137,
    0x1104 => 0x3138, 0x1105 => 0x3139, 0x1106 => 0x3141, 0x1107 => 0x3142,
    0x1108 => 0x3143, 0x1109 => 0x3145, 0x110A => 0x3146, 0x110B => 0x3147,
    0x110C => 0x3148, 0x110D => 0x3149, 0x110E => 0x314A, 0x110F => 0x314B,
    0x1110 => 0x314C, 0x1111 => 0x314D, 0x1112 => 0x314E, 0x1114 => 0x3165,
    0x1115 => 0x3166, 0x111A => 0x3140, 0x111C => 0x316E, 0x111D => 0x3171,
    0x111E => 0x3172, 0x1120 => 0x3173, 0x1121 => 0x3144, 0x1122 => 0x3174,
    0x1123 => 0x3175, 0x1127 => 0x3176, 0x1129 => 0x3177, 0x112B => 0x3178,
    0x112C => 0x3179, 0x112D => 0x317A, 0x112E => 0x317B, 0x112F => 0x317C,
    0x1132 => 0x317D, 0x1136 => 0x317E, 0x1140 => 0x317F,
#   0x1145 => 0x3182, 0x1146 => 0x3183,
    0x1147 => 0x3180, 0x114C => 0x3181, 0x1157 => 0x3184, 0x1158 => 0x3185,
    0x1159 => 0x3186, 0x115B => 0x3167, 0x115C => 0x3135, 0x115D => 0x3136,
    0x1161 => 0x314F, 0x1162 => 0x3150, 0x1163 => 0x3151, 0x1164 => 0x3152,
    0x1165 => 0x3153, 0x1166 => 0x3154, 0x1167 => 0x3155, 0x1168 => 0x3156,
    0x1169 => 0x3157, 0x116A => 0x3158, 0x116B => 0x3159, 0x116C => 0x315A,
    0x116D => 0x315B, 0x116E => 0x315C, 0x116F => 0x315D, 0x1170 => 0x315E,
    0x1171 => 0x315F, 0x1172 => 0x3160, 0x1173 => 0x3161, 0x1174 => 0x3162,
    0x1175 => 0x3163, 0x1184 => 0x3187, 0x1185 => 0x3188, 0x1188 => 0x3189,
    0x1191 => 0x318A, 0x1192 => 0x318B, 0x1194 => 0x318C, 0x119E => 0x318D,
    0x11A1 => 0x318E,
#   0x11A8 => 0x3131, 0x11A9 => 0x3132, 0x11AA => 0x3133, 0x11AB => 0x3134,
#   0x11AC => 0x3135, 0x11AD => 0x3136, 0x11AE => 0x3137, 0x11AF => 0x3139,
#   0x11B0 => 0x313A, 0x11B1 => 0x313B, 0x11B2 => 0x313C, 0x11B3 => 0x313D,
#   0x11B4 => 0x313E, 0x11B5 => 0x313F, 0x11B6 => 0x3140, 0x11B7 => 0x3141,
#   0x11B8 => 0x3142, 0x11B9 => 0x3144, 0x11BA => 0x3145, 0x11BB => 0x3146,
#   0x11BC => 0x3147, 0x11BD => 0x3148, 0x11BE => 0x314A, 0x11BF => 0x314B,
#   0x11C0 => 0x314C, 0x11C1 => 0x314D, 0x11C2 => 0x314E, 0x11C6 => 0x3166,
#   0x11C7 => 0x3167, 0x11C8 => 0x3168, 0x11CC => 0x3169, 0x11CE => 0x316A,
#   0x11D3 => 0x316B, 0x11D7 => 0x316C, 0x11D9 => 0x316D, 0x11DC => 0x316E,
#   0x11DD => 0x316F, 0x11DF => 0x3170, 0x11E2 => 0x3171, 0x11E6 => 0x3178,
#   0x11E7 => 0x317A, 0x11E8 => 0x317C, 0x11EA => 0x317D, 0x11EB => 0x317F,
#   0x11EE => 0x3180, 0x11F0 => 0x3181, 0x11F1 => 0x3182, 0x11F2 => 0x3183,
#   0x11F4 => 0x3184, 0x11F9 => 0x3186, 0x11FF => 0x3165,
    0xA964 => 0x313A, 0xA966 => 0x316A, 0xA968 => 0x313B, 0xA969 => 0x313C,
    0xA96C => 0x313D, 0xA971 => 0x316F,
#   0xD7CD => 0x3138, 0xD7E3 => 0x3173, 0xD7E6 => 0x3143, 0xD7E7 => 0x3175,
#   0xD7E8 => 0x3176, 0xD7EF => 0x317E, 0xD7F9 => 0x3149,
);

# PUA <-> jamo table; only loaded when one of the PUA options is active.
my @HYpuaJamo;
if ($OPT{frompua} or $OPT{topua}) {
    @HYpuaJamo = arr_hypua2jamo();
}

##### main routine #####

binmode (STDIN,":utf8");
binmode (STDOUT,":utf8");

# Each input line is held in $_ and rewritten in place by the steps below;
# their order matters.  One "." per line goes to STDERR as a progress mark.
while (<>) {
    print STDERR ".";
    syllable2jamo();
    if ($OPT{frompua}) {
        hypua2jamo();
        hypuasingle2jamo();
    }
    compose_jamo();
    insert_filler();
    ascii2tonemark()   if $OPT{latintm};
    insert_boundary()  if $OPT{boundary};
    reorder_tonemark() if $OPT{reordertm};
    unless ($OPT{decompose}) {
        jamo2hypua()    if $OPT{topua};
        jamo2syllable();
        jamo2jamocomp() if $OPT{compatjamo};
    }
    normalize_hanja() if $OPT{normalhanja};
    print;
}
print STDERR "\n";

##### subroutines #####

# Option -r.  Splits $_ on syllable blocks.  Inside a block (odd pieces) a
# trailing tone mark is moved in front of the jamo run; in the text between
# blocks (even pieces) U+25CC is appended after every tone mark.
sub reorder_tonemark {
    my @blocks = split /($syllblock)/;
    for my $i (0 .. $#blocks) {
        if ($i % 2 == 0) {
            $blocks[$i] =~ s/([$tmrk])/$1\x{25CC}/g;
        }
        else {
            $blocks[$i] =~ s/([$cho$jung$jong]+)([$tmrk])$/$2$1/;
        }
    }
    $_ = join '',@blocks;
}

# Option -t.  A U+00B7 (resp. U+003A) standing directly before a syllable
# block that is not already followed by a tone mark is removed, and
# U+302E (resp. U+302F) is appended to that block instead.
sub ascii2tonemark {
    s/\x{B7}([$cho][$jung][$jong]?)(?![$tmrk])/$1\x{302E}/g;
    s/\x{3A}([$cho][$jung][$jong]?)(?![$tmrk])/$1\x{302F}/g;
}

# Option -o.  For every row of @HYpuaJamo, element 0 is the code point to
# look for in $_ and elements 1.. are the code points of its replacement.
sub hypua2jamo {
    for my $i (0 .. $#HYpuaJamo) {
        my $puachar = chr $HYpuaJamo[$i][0];
        my $jamoseq = "";
        for my $j (1 .. $#{$HYpuaJamo[$i]}) {
            $jamoseq .= chr $HYpuaJamo[$i][$j];
        }
        s/$puachar/$jamoseq/g;
    }
}

# Option -p.  Reverse of hypua2jamo: the sequence built from elements 1..
# of each row is replaced by the single character of element 0.  Rows are
# applied in table order.
sub jamo2hypua {
    for my $i (0 .. $#HYpuaJamo) {
        my $puachar = chr $HYpuaJamo[$i][0];
        my $jamoseq = "";
        for my $j (1 .. $#{$HYpuaJamo[$i]}) {
            $jamoseq .= chr $HYpuaJamo[$i][$j];
        }
        s/$jamoseq/$puachar/g;
    }
}

# Option -b.  Rewrites $_ with $boundary between syllable blocks.  Text
# lying between two blocks is kept as one piece, except that every stray
# jamo or tone mark inside it becomes a piece of its own.  The line is
# chomped first and always ends in "\n" afterwards.
sub insert_boundary {
    chomp;
    my @parts = split /($syllblock)/, $_;
    my @newparts;
    for my $i (0 .. $#parts) {
        if ($i % 2 == 0) {
            # Text between syllable blocks.  Test the length, not the truth
            # value: the string "0" is false in Perl and a lone digit zero
            # would otherwise vanish from the output.
            next unless length $parts[$i];
            my @subparts = grep { length $_ } split /($jamos)/, $parts[$i];
            push @newparts, join ($boundary, @subparts);
        }
        else {
            # A syllable block captured by the split pattern.
            push @newparts, $parts[$i];
        }
    }
    $_ = join $boundary, @newparts;
    $_ .= "\n";
}

sub insert_filler { s/(?