package Slim::Utils::Unicode; # $Id$ =head1 NAME Slim::Utils::Unicode =head1 SYNOPSIS my $utf8 = Slim::Utils::Unicode::utf8decode($string, 'iso-8859-1'); my $enc = Slim::Utils::Unicode::encodingFromString($string); =head1 DESCRIPTION This module is a wrapper around Encode:: functions, and comprises character set guessing, encoding, decoding and translation. Most of these functions are non-ops on perl < 5.8.x The recompose & decompose parts have been modified from Simon's defunct Unicode::Decompose mdoule. Unicode characters that somehow become decomposed. Sometimes we'll see a URI encoding such as: o%CC%88 - which is an o with diaeresis. The correct (composed) version of this should be %C3%B6 =head1 METHODS =cut use strict; use Fcntl qw(:seek); use File::BOM; use POSIX qw(LC_CTYPE LC_TIME); use Text::Unidecode; use Slim::Utils::Log; BEGIN { my $hasEDD; sub hasEDD { return $hasEDD if defined $hasEDD; $hasEDD = 0; eval { require Encode::Detect::Detector; $hasEDD = 1; }; return $hasEDD; } } sub _printPath { use bytes; my $path = shift; return 'undef' unless defined $path; my $s = utf8::is_utf8($path) ? 'ON:' : 'OFF:'; for (my $i = 0; $i < length($path); $i++) { my $c = substr($path, $i, 1); if (ord($c) > 127) { $s .= sprintf("#%02x", ord($c)); } else { $s .= $c; } } return $s; } # Find out what code page we're in, so we can properly translate file/directory encodings. our ( $lc_ctype, $lc_time, $comb_re_bits, $bomRE, $FB_QUIET, $FB_CROAK, ); { # We implement a decode() & encode(), so don't import those. require Encode; require Encode::Guess; $FB_QUIET = Encode::FB_QUIET(); $FB_CROAK = Encode::FB_CROAK(); $bomRE = qr/^(?: \xef\xbb\xbf | \xfe\xff | \xff\xfe | \x00\x00\xfe\xff | \xff\xfe\x00\x00 )/x; ($lc_ctype, $lc_time) = Slim::Utils::OSDetect->getOS->localeDetails(); # Setup Encode::Guess $Encode::Guess::NoUTFAutoGuess = 1; # Setup suspects for Encode::Guess based on the locale - we might also # want to use our own Language pref? if ($lc_ctype ne 'utf8') { Encode::Guess->add_suspects($lc_ctype); } # Regex for determining if a string contains combining marks and needs to be decomposed # Combining Diacritical Marks + Combining Diacritical Marks Supplement $comb_re_bits = join "|", map { latin1toUTF8(chr($_)) } (0x300..0x36F, 0x1DC0..0x1DFF); } =head2 currentLocale() Returns the current system locale. On Windows, this is the current code page. On OS X, it's always utf-8. On *nix, this is LC_CTYPE. =cut sub currentLocale { return $lc_ctype; } =head2 utf8decode( $string ) Decode the current string to UTF-8 Return the newly decoded string. =cut sub utf8decode { return utf8decode_guess(@_); } =head2 utf8decode_guess( $string, @encodings ) Decode the current string to UTF-8, using @encodings to guess the encoding of the string if it is not known. Return the newly decoded string. =cut sub utf8decode_guess { my $string = shift; my @preferedEncodings = @_; # Bail early if it's just ascii if (looks_like_ascii($string) || utf8::is_utf8($string)) { return $string; } if ( @preferedEncodings ) { for my $encoding (@preferedEncodings) { $string = eval { Encode::decode($encoding, $string, $FB_QUIET) }; if ( !$@ && utf8::is_utf8($string) ) { return $string; } } } my $charset = encodingFromString($string); if ( $charset && $charset ne 'raw') { my $encoding; if (($encoding = Encode::find_encoding($charset)) && ref $encoding) { return $encoding->decode($string, $FB_QUIET); } } return $string; } =head2 utf8decode_locale( $string ) Decode the current string to UTF-8, using the current locale as the string's encoding. This is a no-op if the string is already encoded as UTF-8 Return the newly decoded string. =cut sub utf8decode_locale { my $string = shift; if ($string && !utf8::is_utf8($string)) { my $decoded = eval { Encode::decode($lc_ctype, $string, $FB_CROAK) }; $string = ($@) ? $string : $decoded; } return $string; } =head2 utf8encode( $string, $encoding ) Encode the current UTF-8 string to the passed encoding. Return the newly encoded string. =cut sub utf8encode { my $string = shift; my $encoding = shift || 'utf8'; if ($string && $encoding ne 'utf8') { utf8::decode($string) unless utf8::is_utf8($string); $string = Encode::encode($encoding, $string, $FB_QUIET); } elsif ($string) { utf8::encode($string) if utf8::is_utf8($string); } return $string; } =head2 utf8encode_locale( $string ) Encode the current UTF-8 string to the current locale. Return the newly encoded string. =cut sub utf8encode_locale { return utf8encode($_[0], $lc_ctype); } =head2 encode_locale( $string ) Encode the current (possibly Unicode) character-string to the current locale. This guarantees that the internal utf-8 flag will be unset, even if it was erroneously set for a string that contains only ASCII (as will be the case with strings from preferences because of the behavior of YAML::Syck). It will not handle the case where the string might actually contain UTF-8 but has not been decoded. Use utf8encode_local() in that case. Return the newly encoded string. =cut sub encode_locale { my $string = shift; return Encode::encode($lc_ctype, $string, $FB_QUIET) if utf8::is_utf8($string); return $string; } =head2 utf8off( $string ) Alias for Encode::encode('utf8', $string) Returns the new string. =cut sub utf8off { utf8::encode($_[0]) if utf8::is_utf8($_[0]); return $_[0]; } =head2 utf8on( $string ) Alias for Encode::decode('utf8', $string) Returns the new string. =cut sub utf8on { utf8::decode($_[0]) unless utf8::is_utf8($_[0]); return $_[0]; } =head2 looks_like_ascii( $string ) Returns true if the passed string is US-ASCII Returns false otherwise. =cut sub looks_like_ascii { use bytes; return 1 if !$_[0]; return 1 if $_[0] !~ /[^\x00-\x7F]/; return 0; } =head2 looks_like_latin1( $string ) Returns true if the passed string is ISO-8859-1 Returns false otherwise. =cut sub looks_like_latin1 { use bytes; return 1 if $_[0] !~ /[^\x00-\x7F\xA0-\xFF]/; return 0; } =head2 looks_like_cp1252( $string ) Returns true if the passed string is Windows-1252 Returns false otherwise. =cut sub looks_like_cp1252 { use bytes; return 1 if $_[0] !~ /[^\x00-\xFF]/; return 0; } =head2 looks_like_utf8( $string ) Returns true if the passed string is UTF-8 Returns false otherwise. =cut sub looks_like_utf8 { # From http://keithdevens.com/weblog/archive/2004/Jun/29/UTF-8.regex # with a fix for the ASCII part return 1 if $_[0] =~ /^(?: [\x00-\x7E] # ASCII | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15 | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 )*$/x; return 0; } =head2 looks_like_utf16( $string ) Returns true if the passed string is UTF-16 Returns false otherwise. =cut sub looks_like_utf16 { use bytes; return 1 if $_[0] =~ /^(?:\xfe\xff|\xff\xfe)/; return 0; } =head2 looks_like_utf32( $string ) Returns true if the passed string is UTF-32 Returns false otherwise. =cut sub looks_like_utf32 { use bytes; return 1 if $_[0] =~ /^(?:\x00\x00\xfe\xff|\xff\xfe\x00\x00)/; return 0; } =head2 latin1toUTF8( $string ) Returns a UTF-8 encoded string from a ISO-8859-1 string. =cut sub latin1toUTF8 { my $data = shift; $data = eval { Encode::encode('utf8', $data, $FB_QUIET) } || $data; return $data; } =head2 utf8toLatin1( $string ) Returns a ISO-8859-1 string from a UTF-8 encoded string. =cut sub utf8toLatin1 { my $data = shift; $data = eval { Encode::encode('iso-8859-1', $data, $FB_QUIET) } || $data; return $data; } =head2 utf8toLatin1Transliterate( $string ) Turn a UTF-8 string into it's US-ASCII equivalent. See L. =cut sub utf8toLatin1Transliterate { my $data = shift; return utf8toLatin1( Text::Unidecode::unidecode($data) ); } =head2 encodingFromString( $string, $ignore_utf8_flag ) Use a best guess effort to return the encoding of the passed string. Returns 'raw' if not: ascii, utf-32, utf-16, utf-8, iso-8859-1 or cp1252 =cut sub encodingFromString { my $string = shift; return 'utf8' if !$_[0] && utf8::is_utf8($string); # Don't copy a potentially large string - just read it from the stack. if (looks_like_ascii($string)) { return 'ascii'; } elsif (looks_like_utf32($string)) { return 'utf-32'; } elsif (looks_like_utf16($string)) { return 'utf-16'; } elsif (looks_like_utf8($string)) { return 'utf8'; } elsif (looks_like_latin1($string)) { return 'iso-8859-1'; } elsif (looks_like_cp1252($string)) { return 'cp1252'; } if ( !hasEDD() ) { return 'raw'; } # Check Encode::Detect::Detector before ISO-8859-1, as it can find # overlapping charsets. my $charset = Encode::Detect::Detector::detect($string); # # Encode::Detect::Detector is mislead to to return Big5 with some characters # # In these cases Encode::Guess does a better job... (bug 9553) if ($charset =~ /^(?:big5|euc-jp|euc-kr|euc-cn|euc-tw)$/i) { eval { $charset = Encode::Guess::guess_encoding($string); $charset = $charset->name; }; # Bug 10671: sometimes Encode::Guess returns ambiguous results like "ascii or utf8" if ($@) { logError($@); if ($charset =~ /utf8/i) { $charset = 'utf8'; logError("Falling back to: $charset"); } else { $charset = ''; } } } $charset =~ s/utf-8/utf8/i; if ($charset) { return lc($charset); } return 'raw'; } =head2 encodingFromFileHandle( $fh ) Use a best guess effort to return the encoding of the passed file handle. Returns 'raw' if not: ascii, utf-32, utf-16, utf-8, iso-8859-1 or cp1252 =cut sub encodingFromFileHandle { my $fh = shift; # If we didn't get a filehandle, not much we can do. if (!ref($fh) || !$fh->can('seek')) { logBacktrace("Didn't get a filehandle from caller!"); return; } local $/ = undef; # Save the old position (if any) # And find the file size. # # These must be seek() and not sysseek(), as File::BOM uses seek(), # and they'll get confused otherwise. my $maxsz = 4096; my $pos = tell($fh); my $size = seek($fh, 0, SEEK_END) ? tell($fh) : $maxsz; # Set a limit on the size to read. if ($size > $maxsz || $size < 0) { $size = $maxsz; } # Don't do any translation. binmode($fh, ":raw"); # Try to find a BOM on the file - otherwise check the string # # Although get_encoding_from_filehandle tries to determine if # the handle is seekable or not - the Protocol handlers don't # implement a seek() method, and even if they did, File::BOM # internally would try to read(), which doesn't mix with # sysread(). So skip those m3u files entirely. my $enc = ''; # Explitly check for IO::String - as it does have a seek() method! if (ref($fh) && ref($fh) ne 'IO::String' && $fh->can('seek')) { $enc = File::BOM::get_encoding_from_filehandle($fh); } # File::BOM got something - let's get out of here. return $enc if $enc; # Seek to the beginning of the file. seek($fh, 0, SEEK_SET); # read($fh, my $string, $size); # Seek back to where we started. seek($fh, $pos, SEEK_SET); return encodingFromString($string); } =head2 encodingFromFile( $fh ) Use a best guess effort to return the encoding of the passed file name or file handle. Returns 'raw' if not: ascii, utf-32, utf-16, utf-8, iso-8859-1 or cp1252 =cut sub encodingFromFile { my $file = shift; my $encoding = $lc_ctype; if (ref($file) && $file->can('seek')) { $encoding = encodingFromFileHandle($file); } elsif (-r $file) { my $fh = FileHandle->new; $fh->open($file) or do { logError("Couldn't open file: [$file] : $!"); return $encoding; }; $encoding = encodingFromFileHandle($fh); $fh->close(); } else { logBacktrace("Not a filename or filehandle: [$file]"); } return $encoding; } =head2 recomposeUnicode( $string ) Recompose a decomposed UTF-8 string. =cut sub recomposeUnicode { my $string = shift; require Slim::Utils::Unicode::Recompose; $string =~ s/$Slim::Utils::Unicode::Recompose::regex/$Slim::Utils::Unicode::Recompose::table{$1}/go; return $string; } =head2 hasCombiningMarks( $string ) Returns 1 if the string contains any characters that are combining marks. =cut sub hasCombiningMarks { return $_[0] =~ /(?:$comb_re_bits)/o ? 1 : 0; } =head2 stripBOM( $string ) Removes UTF-8, UTF-16 & UTF-32 Byte Order Marks and returns the string. =cut sub stripBOM { my $string = shift; use bytes; $string =~ s/$bomRE//; return $string; } =head2 decode( $encoding, $string ) An alias for L =cut *decode = \&Encode::decode; =head2 encode( $encoding, $string ) An alias for L =cut *encode = \&Encode::encode; =head2 from_to( $string, $from_encoding, $to_encoding ) =cut sub from_to { my $string = shift; my $from_encoding = shift; my $to_encoding = shift; return $string if $from_encoding eq $to_encoding; # wrap transformation in eval as utf8 -> iso8859-1 could break on wide characters eval { $string = decode($from_encoding, $string); $string = encode($to_encoding, $string); }; if ($@) { Slim::Utils::Log->logger('server')->warn("Could not convert from $from_encoding to $to_encoding: $string ($@)"); } return $string; } =head1 SEE ALSO L, L, L =cut 1; __END__