#!/usr/local/bin/perl
# Convert UTF-8 encoded files to use numerical character references,
# i.e. &#20418;
#
# Usage: pass the name of the file to convert as the first argument.
# The converted text is written to standard output, one output line per
# input line. ASCII bytes are copied through unchanged; every multi-byte
# UTF-8 sequence is replaced by a decimal reference of the form &#NNNN;.

use strict;
use warnings;

# One UTF-8 encoded character, matched on raw bytes. NUL bytes and bytes
# that do not start a sequence listed here (stray continuation bytes,
# 0xf8-0xff) are not matched, so they are dropped from the output.
my $UTF8_SEQUENCE = qr/
      [\x01-\x7f]                      # 1 byte:  ASCII (NUL excluded)
    | [\xc0-\xdf] [\x80-\xbf]          # 2 bytes: U+0080  .. U+07FF
    | [\xe0-\xef] [\x80-\xbf]{2}       # 3 bytes: U+0800  .. U+FFFF
    | [\xf0-\xf7] [\x80-\xbf]{3}       # 4 bytes: U+10000 and above
/x;

# Payload bits carried by the lead byte, keyed by total sequence length.
my %LEAD_MASK_FOR = (
    2 => 0x1f,
    3 => 0x0f,
    4 => 0x07,
);

# Convert one string of UTF-8 bytes to ASCII with numeric character
# references.
#   $bytes  - the raw (undecoded) UTF-8 bytes of one line
# Returns the converted string. An empty input yields an empty string.
sub utf8_to_ncr {
    my ($bytes) = @_;

    my $converted = '';
    for my $sequence ($bytes =~ /($UTF8_SEQUENCE)/g) {
        my ($lead, @continuation) = unpack 'C*', $sequence;

        # Single-byte sequences are plain ASCII and pass through as-is.
        if (!@continuation) {
            $converted .= $sequence;
            next;
        }

        # Each continuation byte contributes its low six bits.
        my $code_point = $lead & $LEAD_MASK_FOR{ length $sequence };
        for my $byte (@continuation) {
            $code_point = $code_point * 0x40 + ($byte & 0x3f);
        }
        $converted .= "&#$code_point;";
    }

    return $converted;
}

# Script entry point: convert the file named by the first argument and
# print the result to standard output.
sub main {
    my @args = @_;

    if (!@args) {
        # Stop here instead of falling through to open() with no filename.
        print {*STDERR} "Please supply the name of the file to convert.\n";
        exit 1;
    }

    my $path = $args[0];
    open(my $fh, '<', $path) or die "Can't open $path: $!\n";

    while (my $line = <$fh>) {
        chomp $line;
        print utf8_to_ncr($line), "\n";
    }

    close $fh;
    return;
}

# Run only when executed as a script, so the file can also be loaded with
# require without triggering the conversion.
main(@ARGV) unless caller;