#!/usr/bin/perl -w

use strict;
use warnings;

use Getopt::Std;

=head1 NAME

uni2utf8 - convert Unicode code points to their UTF-8 byte sequences

=head1 SYNOPSIS

uni2utf8 [options] number [number...]

=head1 DESCRIPTION

Converts unicode into utf-8.  Every number names one code point and the
UTF-8 encoding is printed, one line per number, in the notation the number
was given in:

=over 4

=item binary

eight or more characters, all of them C<0> or C<1>

=item hexadecimal

a C<0x> prefix, or hexadecimal digits including at least one of C<a>-C<f>

=item octal

a leading C<0> followed by octal digits

=item decimal

any other string of decimal digits

=back

Arguments that cannot be parsed, or that lie above U+10FFFF, are reported
on standard error and make the exit status non-zero; the remaining
arguments are still converted.

Supports these options:

=over 4

=item -z

Put a single space every half-byte and a triple space every byte (binary
output only).  This grouping is currently always enabled; the flag is
accepted so that it is not mistaken for a number.

=back

=cut

# Formatting switches.  "z" has always been on, so it stays the default and
# a -z on the command line merely confirms it.
my %options = (z => 1);

# Only act on the command line when executed as a script, so the subroutines
# can be loaded with require/do without side effects.
main() unless caller;

########################################################################
############################# SUBROUTINES ##############################
########################################################################

# Script entry point: consumes the options, then converts every remaining
# argument.  A bad argument is reported and skipped rather than aborting the
# whole run.
sub main {
    getopts('z', \%options)
        or die "usage: uni2utf8 [-z] number [number...]\n";

    my $failures = 0;
    for my $arg (@ARGV) {
        next if eval { _convert_argument($arg); 1 };
        warn "uni2utf8: $arg: $@";
        $failures++;
    }
    exit 1 if $failures;
    return;
}

# Decides which notation $arg is written in and hands it to the matching
# print_* routine.  The 0x test must precede the bare-hex test: "0xAB" also
# contains hex letters and would otherwise be prefixed a second time.
# Dies when the argument fits no notation.
sub _convert_argument {
    my ($arg) = @_;

    if ($arg =~ /\A[01]{8,}\z/) {
        print_bin($arg);
    }
    elsif ($arg =~ /\A0[xX][0-9a-fA-F]+\z/) {
        print_hex($arg);
    }
    elsif ($arg =~ /\A[0-9a-fA-F]+\z/ && $arg =~ /[a-fA-F]/) {
        print_hex("0x$arg");
    }
    elsif ($arg =~ /\A0[0-7]*\z/) {
        print_oct($arg);
    }
    elsif ($arg =~ /\A[1-9][0-9]*\z/) {
        print_dec($arg);
    }
    else {
        die "not a binary, hexadecimal, octal or decimal number\n";
    }
    return;
}

# Prints the UTF-8 encoding of a code point given as a string of 0s and 1s.
# With $options{z} set, each byte is shown as two nibbles separated by one
# space and bytes are separated by three spaces, as the POD documents.
sub print_bin {
    my ($bits) = @_;
    my $result = binary_uni2utf8($bits);

    if ($options{z}) {
        my @bytes = $result =~ /([01]{8})/g;
        $result = join '   ',
            map { substr($_, 0, 4) . ' ' . substr($_, 4, 4) } @bytes;
    }
    print $result, "\n";
    return;
}

# Prints the UTF-8 encoding, as one hexadecimal number, of the code point
# given as a "0x..." string.
sub print_hex {
    my ($hex) = @_;
    _print_number(oct($hex), 'x');
    return;
}

# Prints the UTF-8 encoding, as one octal number, of the code point given
# as an octal string with a leading 0.
sub print_oct {
    my ($octal) = @_;
    _print_number(oct($octal), 'o');
    return;
}

# Prints the UTF-8 encoding, as one decimal number, of the code point given
# as a decimal string.  The argument is used as-is: running it through oct()
# would reinterpret the decimal digits as octal.
sub print_dec {
    my ($decimal) = @_;
    _print_number($decimal, 'd');
    return;
}

# Shared tail of print_hex/print_oct/print_dec: encodes $code_point and
# prints the resulting bytes as a single number using the sprintf conversion
# letter $conversion ('x', 'o' or 'd').  The number is zero-padded to two
# digits per encoded byte (02/04/06/08).
sub _print_number {
    my ($code_point, $conversion) = @_;

    my $result = binary_uni2utf8(sprintf '%b', $code_point);
    my $digits = length($result) / 4;
    my $format = '%0' . $digits . $conversion . "\n";
    printf $format, oct("0b$result");
    return;
}

=head1 SUBROUTINES

=head2 binary_uni2utf8

Takes a Unicode code point as a string of 0s and 1s.  The string may have
any length (leading zeros are ignored) as long as the value does not exceed
U+10FFFF.  Returns the UTF-8 encoding as a string of 0s and 1s that is 8,
16, 24 or 32 chars long.

Dies if the argument is not a non-empty string of 0s and 1s or if the value
is above U+10FFFF.

=cut

sub binary_uni2utf8 {
    my $unicode = shift;

    unless (defined $unicode && $unicode =~ /\A[01]+\z/) {
        die "binary_uni2utf8: expected a string of 0s and 1s\n";
    }

    # Normalise to exactly 21 bits, the width of the largest code point, so
    # the substr offsets below are the same for every input length.
    (my $significant = $unicode) =~ s/\A0+//;
    if (length($significant) > 21) {
        die "binary_uni2utf8: code point is above U+10FFFF\n";
    }
    my $bits       = ('0' x (21 - length $significant)) . $significant;
    my $code_point = oct("0b$bits");
    if ($code_point > 0x10FFFF) {
        die "binary_uni2utf8: code point is above U+10FFFF\n";
    }

    # One byte: 0xxxxxxx (7 payload bits).
    if ($code_point < 0x80) {
        return '0' . substr($bits, 14, 7);
    }

    # Two bytes: 110xxxxx 10xxxxxx (11 payload bits).
    if ($code_point < 0x800) {
        return '110' . substr($bits, 10, 5)
             . '10'  . substr($bits, 15, 6);
    }

    # Three bytes: 1110xxxx 10xxxxxx 10xxxxxx (16 payload bits).
    if ($code_point < 0x10000) {
        return '1110' . substr($bits, 5, 4)
             . '10'   . substr($bits, 9, 6)
             . '10'   . substr($bits, 15, 6);
    }

    # Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (21 payload bits).
    return '11110' . substr($bits, 0, 3)
         . '10'    . substr($bits, 3, 6)
         . '10'    . substr($bits, 9, 6)
         . '10'    . substr($bits, 15, 6);
}

# True value so the file can also be loaded with require.
1;