# ChineseNumbers - convert between integers and Chinese numerals.
#
# Author: Erik Peterson
# E-mail: erik@mandarintools.com
# Source: http://www.mandarintools.com/numbers.html
#
# Usage:
#
#   use ChineseNumbers;
#
#   ChineseNumbers->EnglishToChineseNumber(enumber, [output_type])
#       enumber is an integer (a fractional part is ignored)
#       output_type (which is optional) can be
#           big5      : Output Chinese using Big5
#           formalb5  : Output as formal numbers in Big5
#           gb        : Output using GB
#           formalgb  : Output as formal numbers in GB (not working yet;
#                       currently produces the same result as "gb")
#           utf8      : Output as (traditional character) UTF-8
#           unicodehex: Output as 4-digit Unicode hex blocks
#           pinyin    : Output as Hanyu Pinyin
#           jyutpin   : Output as Cantonese jyutpin romanization
#           yalecant  : Output as Cantonese Yale romanization
#       The default is big5
#
#   ChineseNumbers->ChineseToEnglishNumber(cnumber, [input_type])
#       cnumber is a string in GB, Big5, UTF-8
#       input_type is "big5", "gb", or "utf8", depending on cnumber
#       default is "big5"
#
#   ChineseNumbers->chinese_output([option])
#       Set the default output type used by EnglishToChineseNumber
#       option can be any of the output options for EnglishToChineseNumber
#       If no arguments, returns the current default
#
#   ChineseNumbers->chinese_input([option])
#       Set the default input type used by ChineseToEnglishNumber
#       option can be "big5", "gb", or "utf8"
#       If no arguments, returns the current default
#
# Implementation note: all Chinese characters are written below as Unicode
# code points and converted to Big5 / GB byte strings when the module is
# loaded.  The source is therefore plain ASCII and no longer depends on the
# encoding the file happens to be stored in.

package ChineseNumbers;

require Exporter;
use strict;
use warnings;
use subs qw{EnglishToChineseNumber ChineseToEnglishNumber};
use Encode ();

# --------------------------------------------------------------------------
# Encoding helpers (used while the tables below are being built)
# --------------------------------------------------------------------------

# Encode one Unicode code point as a Big5 byte string; dies if the character
# does not exist in Big5.
sub _big5 {
    my ($code_point) = @_;
    my $char = chr $code_point;
    return Encode::encode('big5-eten', $char, Encode::FB_CROAK);
}

# Encode one Unicode code point as a GB2312 (EUC-CN) byte string; dies if the
# character does not exist in GB2312.
sub _gb {
    my ($code_point) = @_;
    my $char = chr $code_point;
    return Encode::encode('euc-cn', $char, Encode::FB_CROAK);
}

# Split a byte string into consecutive pieces of $width bytes (the last piece
# may be shorter).
sub _chunks {
    my ($string, $width) = @_;
    my @chunks;
    for (my $pos = 0; $pos < length $string; $pos += $width) {
        push @chunks, substr($string, $pos, $width);
    }
    return @chunks;
}

# --------------------------------------------------------------------------
# Tables.  Internally every number is a Big5 string of 2-byte characters.
# --------------------------------------------------------------------------

# One row per character the converter can emit or read:
#   traditional, simplified, formal (traditional), pinyin, Yale, Jyutping
my @SYMBOL_ROWS = (
    [ 0x8CA0, 0x8D1F, 0x8CA0, 'fu4',    'fu',     'fu6'    ],    # negative sign
    [ 0x96F6, 0x96F6, 0x96F6, 'ling2',  'ling2',  'ling4'  ],    # 0
    [ 0x4E00, 0x4E00, 0x58F9, 'yi1',    'yat',    'jat1'   ],    # 1
    [ 0x4E8C, 0x4E8C, 0x8CB3, 'er4',    'yih7',   'ji6'    ],    # 2
    [ 0x4E09, 0x4E09, 0x53C3, 'san1',   'saam1',  'saam1'  ],    # 3
    [ 0x56DB, 0x56DB, 0x8086, 'si4',    'sei5',   'sei3'   ],    # 4
    [ 0x4E94, 0x4E94, 0x4F0D, 'wu3',    'ng4',    'ng5'    ],    # 5
    [ 0x516D, 0x516D, 0x9678, 'liu4',   'luhk',   'luk6'   ],    # 6
    [ 0x4E03, 0x4E03, 0x67D2, 'qi1',    'chat1',  'cat1'   ],    # 7
    [ 0x516B, 0x516B, 0x634C, 'ba1',    'baat1',  'baat3'  ],    # 8
    [ 0x4E5D, 0x4E5D, 0x7396, 'jiu3',   'gao3',   'gau2'   ],    # 9
    [ 0x5341, 0x5341, 0x62FE, 'shi2',   'sap7',   'sap6'   ],    # 10
    [ 0x767E, 0x767E, 0x4F70, 'bai3',   'baak5',  'baak3'  ],    # 100
    [ 0x5343, 0x5343, 0x4EDF, 'qian1',  'chin1',  'cin1'   ],    # 1000
    [ 0x842C, 0x4E07, 0x842C, 'wan4',   'maahn',  'maan6'  ],    # 10^4
    [ 0x5104, 0x4EBF, 0x5104, 'yi4',    'yik1',   'jik1'   ],    # 10^8
    [ 0x5146, 0x5146, 0x5146, 'zhao4',  'siu',    'siu6'   ],    # 10^12
    [ 0x5169, 0x4E24, 0x5169, 'liang3', 'leung4', 'loeng5' ],    # alternate 2
    # 10^16.  This unit was emitted by EnglishToChineseNumber but had no
    # entry in the translation tables, so it vanished from non-Big5 output.
    [ 0x4EAC, 0x4EAC, 0x4EAC, 'jing1',  'ging1',  'ging1'  ],
);

my (%trad2simp,   %simp2trad,     %trad2formal, %trad2unicode,
    %unicode2trad, %trad2pinyin,  %trad2yalecant, %trad2jyutpin);

for my $row (@SYMBOL_ROWS) {
    my ($trad_cp, $simp_cp, $formal_cp, $pinyin, $yale, $jyutping) = @$row;
    my $trad = _big5($trad_cp);
    my $simp = _gb($simp_cp);

    $trad2simp{$trad}     = $simp;
    $simp2trad{$simp}     = $trad;
    $trad2formal{$trad}   = _big5($formal_cp);
    $trad2unicode{$trad}  = sprintf '%04X', $trad_cp;
    $trad2pinyin{$trad}   = $pinyin;
    $trad2yalecant{$trad} = $yale;
    $trad2jyutpin{$trad}  = $jyutping;

    # UTF-8 input may use either the traditional or the simplified form.
    $unicode2trad{ sprintf '%04X', $trad_cp } = $trad;
    $unicode2trad{ sprintf '%04X', $simp_cp } = $trad;
}

my $TEN     = 10;
my $minus   = _big5(0x8CA0);
my $ALTTWO  = _big5(0x5169);
my $ORDINAL = _big5(0x7B2C);    # ordinal prefix; tolerated at the start of input

# Digit characters indexed by value, and the reverse lookup.
my @digits = map { _big5($_) }
    (0x96F6, 0x4E00, 0x4E8C, 0x4E09, 0x56DB, 0x4E94, 0x516D, 0x4E03, 0x516B, 0x4E5D);
my %digits = ((map { ($digits[$_], $_) } 0 .. 9), $ALTTWO, 2);

# Units inside a group of four digits: ten, hundred, thousand.
my @beforeWan = map { _big5($_) } (0x5341, 0x767E, 0x5343);
my %beforeWan = map { ($beforeWan[$_], $TEN ** ($_ + 1)) } 0 .. $#beforeWan;

# Units closing each group of four digits: (none), 10^4, 10^8, 10^12, 10^16.
# The hash maps each unit character to its power of ten.
my @afterWan = ("", map { _big5($_) } (0x842C, 0x5104, 0x5146, 0x4EAC));
my %afterWan = map { ($afterWan[$_], 4 * $_) } 1 .. $#afterWan;

my $default_outputtype = "big5";
my $default_inputtype  = "big5";

# Output types that are a per-character table lookup:
#   type => [ lookup table, text appended after every character ]
my %OUTPUT_RULE = (
    gb         => [ \%trad2simp,     ""  ],
    formalb5   => [ \%trad2formal,   ""  ],
    formalgb   => [ \%trad2simp,     ""  ],    # formal GB not implemented yet
    unicodehex => [ \%trad2unicode,  " " ],
    pinyin     => [ \%trad2pinyin,   " " ],
    jyutpin    => [ \%trad2jyutpin,  " " ],
    yalecant   => [ \%trad2yalecant, " " ],
);

# --------------------------------------------------------------------------
# Public interface
# --------------------------------------------------------------------------

# Constructor.  The object carries no state; it only exists so the
# conversion routines can be called as methods.
sub new {
    my ($class) = @_;
    return bless {}, (ref($class) || $class || __PACKAGE__);
}

# The heart of the program.  Converts an integer to Chinese numerals.
#
#   $enumber    - the number; every character other than digits, "." and "-"
#                 is stripped first, and a fractional part is ignored
#   $outputtype - optional, case-insensitive; see the list at the top of the
#                 file.  Unknown types fall back to Big5.
#
# Returns the converted string.  Zero is rendered in the requested output
# type like any other value.
sub EnglishToChineseNumber {
    my ($self, $enumber, $outputtype) = @_;

    if (!defined $outputtype || $outputtype eq "") {
        $outputtype = $default_outputtype;
    }
    $outputtype = lc $outputtype;

    return _render_big5(_number_to_big5($enumber), $outputtype);
}

# Converts a Chinese numeral string to a number.
#
#   $inputnumber - the numeral, encoded according to $inputtype
#   $inputtype   - optional "big5", "gb" or "utf8" (case-insensitive);
#                  anything else is treated as Big5
#
# A string of three or more plain digit characters is read positionally
# (e.g. a year written digit by digit); everything else is read as a full
# numeral with units.  Returns "" when the numeral cannot be parsed.
sub ChineseToEnglishNumber {
    my ($self, $inputnumber, @rest) = @_;

    my $inputtype = @rest ? $rest[0] : $default_inputtype;
    $inputtype   = lc(defined $inputtype ? $inputtype : "");
    $inputnumber = "" unless defined $inputnumber;

    # Normalise the input to the internal Big5 form.  Characters that are
    # not part of a numeral are dropped, as before.
    my $cnumber = "";
    if ($inputtype eq "gb") {
        for my $char (_chunks($inputnumber, 2)) {
            my $trad = $simp2trad{$char};
            $cnumber .= $trad if defined $trad;
        }
    }
    elsif ($inputtype eq "utf8") {
        # Every numeral character occupies three bytes in UTF-8.
        for my $char (_chunks($inputnumber, 3)) {
            my $trad = $unicode2trad{ utf82hex($char) };
            $cnumber .= $trad if defined $trad;
        }
    }
    else {
        $cnumber = $inputnumber;
    }

    if (length($cnumber) > 2) {
        my $alldigits = 1;
        for my $char (_chunks($cnumber, 2)) {
            $alldigits = 0 unless defined $digits{$char};
        }
        return ChineseToEnglishBrief($cnumber) if $alldigits;
    }
    return ChineseToEnglishFull($cnumber);
}

# Reads a Big5 string made only of digit characters as a positional decimal
# number.  Characters that are not digits count as 0.
sub ChineseToEnglishBrief {
    my ($cnumber) = @_;
    $cnumber = "" unless defined $cnumber;

    my $total = 0;
    for my $char (_chunks($cnumber, 2)) {
        my $digitval = $digits{$char};
        $total = $total * 10 + (defined $digitval ? $digitval : 0);
    }
    return $total;
}

# Reads a full Big5 numeral (digits combined with unit characters).
# Prints a message on STDERR and returns "" when an unexpected character is
# met.
sub ChineseToEnglishFull {
    my ($cnumber) = @_;
    $cnumber = "" unless defined $cnumber;

    my @chars      = _chunks($cnumber, 2);
    my $negative   = 0;
    my $power      = 0;    # scale applied to whatever follows the last large unit
    my $leveltotal = 0;    # value collected since the last large unit
    my $total      = 0;

    my $i = 0;
    while ($i < @chars) {
        my $cchar = $chars[$i];

        # NOTE(review): the original accepted a second negative marker whose
        # literal was lost to encoding damage; only the standard negative
        # character is recognised here.
        if ($i == 0 && $cchar eq $minus) {
            $negative = 1;
        }
        elsif ($i == 0 && $cchar eq $ORDINAL) {
            # Do nothing for now
        }
        elsif (exists $afterWan{$cchar}) {
            # A large unit closes the current group of up to four digits.
            $power      = $afterWan{$cchar};
            $leveltotal = 1 if $leveltotal == 0;
            $total += $leveltotal * (10 ** $power);
            $leveltotal = 0;
            # Digits that follow without a unit of their own are scaled to
            # the next lower group.
            $power -= 4;
        }
        elsif (exists $beforeWan{$cchar}) {
            # A bare ten / hundred / thousand counts once.
            $leveltotal += $beforeWan{$cchar};
        }
        elsif ($cchar eq $digits[0]) {
            # An explicit zero means the remaining digits are plain units.
            $power = 0;
        }
        elsif (exists $digits{$cchar}) {
            my $digitval = $digits{$cchar};
            my $nextcchar = $i + 1 < @chars ? $chars[$i + 1] : undef;
            if (defined $nextcchar && exists $beforeWan{$nextcchar}) {
                # Digit followed by its unit: consume both characters.
                $leveltotal += $digitval * $beforeWan{$nextcchar};
                $i++;
            }
            else {
                $leveltotal += $digitval;
            }
        }
        else {
            print STDERR "Seems to be an error in the number. $cnumber\n";
            return "";
        }
        $i++;
    }

    # Catch remaining leveltotal
    $total += $leveltotal * 10 ** $power;
    $total = -$total if $negative;
    return $total;
}

# Gets or sets the default output type of EnglishToChineseNumber.
sub chinese_output {
    my ($self, @args) = @_;
    $default_outputtype = $args[0] if @args;
    return $default_outputtype;
}

# Gets or sets the default input type of ChineseToEnglishNumber.
sub chinese_input {
    my ($self, @args) = @_;
    $default_inputtype = $args[0] if @args;
    return $default_inputtype;
}

# hex2utf8: Take a string of 4 hex digits (0-9A-F), with or without a
# leading "0x", and convert it to the corresponding (1, 2, or 3 byte)
# UTF-8 representation.
sub hex2utf8 {
    my ($hexchar) = @_;
    my $code = hex(defined $hexchar ? $hexchar : "");

    if ($code <= 127) {
        return pack("C", $code);
    }
    if ($code <= 2047) {
        return pack("C2", ($code >> 6) | 0xC0, ($code & 0x3F) | 0x80);
    }
    return pack("C*",
        ($code >> 12) | 0xE0,
        (($code & 0x0FFF) >> 6) | 0x80,
        ($code & 0x003F) | 0x80);
}

# utf82hex: Return the code point of the FIRST character of a UTF-8 byte
# string as four upper-case hex digits.  Handles 1, 2 and 3 byte sequences;
# returns "" for an empty string.
sub utf82hex {
    my ($utfstring) = @_;
    return "" unless defined $utfstring && length $utfstring;

    # Missing continuation bytes of a truncated sequence count as 0.
    my @bytes = unpack("C*", substr($utfstring, 0, 3));
    my ($byte1, $byte2, $byte3) = map { defined $_ ? $_ : 0 } @bytes[0 .. 2];

    my $unival;
    if ($byte1 <= 0x7F) {                   # 1 byte long (ASCII)
        $unival = $byte1;
    }
    elsif (($byte1 & 0xE0) == 0xC0) {       # 2 bytes long
        $unival = (($byte1 & 0x1F) << 6) | ($byte2 & 0x3F);
    }
    else {                                  # 3 bytes long
        $unival = (($byte1 & 0x0F) << 12) | (($byte2 & 0x3F) << 6) | ($byte3 & 0x3F);
    }
    return sprintf("%04X", $unival);
}

# --------------------------------------------------------------------------
# Private helpers for EnglishToChineseNumber
# --------------------------------------------------------------------------

# Builds the Big5 numeral for a number.
sub _number_to_big5 {
    my ($enumber) = @_;

    # Remove everything that cannot be part of a number.
    my $text = defined $enumber ? "$enumber" : "";
    $text =~ s/[^0-9\.-]//g;

    my $value;
    {
        # Leftovers such as "" or "-" simply count as zero.
        no warnings 'numeric';
        $value = $text + 0;
    }

    my $negative = $value < 0 ? 1 : 0;
    $value = int(abs $value);

    # If zero, the result is just the zero character.
    return $digits[0] if $value == 0;

    # Get the value of the coefficient for each power of ten.
    my @powers;
    my $power = 0;
    while ($TEN ** $power <= $value) {
        my $remainder = $value % ($TEN ** ($power + 1));
        $powers[$power] = $remainder / ($TEN ** $power);
        # Subtract out the current power's coefficient and increase the power.
        $value -= $remainder;
        $power++;
    }

    my $inzero     = 0;     # inside a run of zeros (a run yields one zero character)
    my $canaddzero = 0;     # a zero needs something non-zero on both sides
    my $cnumber    = "";

    # Assemble the numeral from the lowest power upwards.
    for my $i (0 .. $power - 1) {
        my $coefficient = $powers[$i];

        if ($i % 4 == 0) {
            # Start of the next group of four powers.
            my $unit = $afterWan[$i / 4];
            $unit = "" unless defined $unit;    # beyond the largest known unit

            if ($coefficient != 0) {
                $inzero     = 0;
                $canaddzero = 1;
                $cnumber    = $digits[$coefficient] . $unit . $cnumber;
            }
            elsif (grep { $_ < $power && $powers[$_] != 0 } $i + 1 .. $i + 3) {
                # The group is non-empty, so its unit is still needed.
                $cnumber    = $unit . $cnumber;
                $canaddzero = 0;
            }
        }
        else {
            # Tens, hundreds or thousands place inside a group.
            my $unit = $beforeWan[($i % 4) - 1];

            if ($coefficient != 0) {
                $inzero     = 0;
                $canaddzero = 1;
                # 10 through 19 are written without a leading "one".
                my $is_teen = ($power == 2 && $i == 1 && $coefficient == 1);
                $cnumber = ($is_teen ? "" : $digits[$coefficient]) . $unit . $cnumber;
            }
            elsif ($canaddzero == 1 && $inzero == 0) {
                # Only insert one zero for all consecutive zeroes.
                $inzero  = 1;
                $cnumber = $digits[0] . $cnumber;
            }
        }
    }

    # Add the negative character.
    $cnumber = $minus . $cnumber if $negative;
    return $cnumber;
}

# Converts an internal Big5 numeral to the requested output type.
sub _render_big5 {
    my ($cnumber, $outputtype) = @_;

    if ($outputtype eq "utf8") {
        return join "", map { hex2utf8($trad2unicode{$_}) } _chunks($cnumber, 2);
    }

    my $rule = $OUTPUT_RULE{$outputtype};
    return $cnumber unless $rule;    # "big5" and any unknown type

    my ($table, $suffix) = @$rule;
    my $result = "";
    for my $char (_chunks($cnumber, 2)) {
        my $mapped = $table->{$char};
        $result .= (defined $mapped ? $mapped : "") . $suffix;
    }
    return $result;
}

1;