[Mew-dist 05699] Re: code_check_body in Japanese.pm problem

1998年 7月 27日 (月) 20:43:30 JST

>> In article <19980727202200O.os at example.com>, OHARA Shigeki <os at example.com> writes:

> これは、code_check_body の問題のように思えるんですが、
> どうなんでしょうか？

さっき img に投げたのですが、Japanese.pm.in を以下のものと置き換えてみ
て下さい。
-- 
酒井 清隆 (E-mail: ksakai at example.com)

-------------- next part --------------
# -*-Perl-*-
################################################################
###
###			     Japanese.pm
###
###	      Copyright (C) 1997  Internet Message Group
###
###		     This Perl5 library conforms
###		GNU GENERAL PUBLIC LICENSE Version 2.
###
###
### Author:  Internet Message Group <img at example.com>
### Created: Apr 23, 1997
### Revised: @im_revised@
###

my $PM_VERSION = "IM::Japanese.pm @im_version@";

package IM::Japanese;
require 5.003;
require Exporter;

use IM::Util;
use integer;
use strict;
use vars qw(@ISA @EXPORT);
@ISA = qw(Exporter);
@EXPORT = qw(code_check code_check_body
	     convert_iso2022jp_body conv_iso2022jp);

=head1 NAME

Japanese - IM Japanese handler

=head1 SYNOPSIS

=head1 DESCRIPTION

=cut

use vars qw($C_jis $C_jis_roman $C_sjis $C_sjis_kana
	    $C_euc $C_euc_kana $C_SorE $C_ascii
	    $C_pascii $C_tascii $C_sascii $C_8bit
	    $E_jp $E_asc $E_kana);
BEGIN {
    $C_jis       = '\e\$[@B]([\x21-\x7e][\x21-\x7e])+';
    $C_jis_roman = '\e\([BJ][\s\x21-\x7e]*';
    $C_sjis      = '[\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc]';
    $C_sjis_kana = '[\xa1-\xdf]';
    $C_euc       = '[\xa1-\xfe][\xa1-\xfe]';
    $C_euc_kana  = '\x8e[\xa1-\xdf]';
    $C_SorE      = '[\xa1-\xdf]|[\x8e\xe0-\xfc][\xa1-\xfc]';
    $C_ascii     = '[\s\x21-\x7e]';
#   $C_ascii     = '[\x07\s\x21-\x7e]';	# for IRC freak :-)
    $C_pascii    = '[\x21-\x7e]';
    $C_tascii    = '[\x21\x23-\x27\x2a\x2b\x2d\x30-\x39\x41-\x5a\x5e-\x7e]';
    $C_sascii    = '[\x22\x28\x29\x2c\x2e\x2f\x3a-\x40\x5b-\x5d]';

    $C_8bit      = '[\x80-\xff]';

    ($E_jp, $E_asc, $E_kana) = ("\e\$B", "\e(B", "\e(I");
}

##### CODE CHECKER #####
#
# code_check($line, $use_hankaku_kana)
#	return value: encoding type
#		ascii
#		8bit
#		jis
#		euc
#		sjis
#		sORe
#
sub code_check ($;$) {
    my ($line, $no_hankaku_kana) = @_;
    my ($sjis, $euc);

    if ($line =~ /^$C_ascii*$/o) {
	return 'ascii';
    } elsif ($line =~ /$C_jis/o) {
	return 'jis';
    }

    if ($no_hankaku_kana) {
	$euc = 1 if $line =~ /^($C_ascii|$C_euc)+$/o;
	$sjis = 1 if $line =~ /^($C_ascii|$C_sjis)+$/o;
    } else {
	$euc = 1 if $line =~ /^($C_ascii|$C_euc|$C_euc_kana)+$/o;
	$sjis = 1 if $line =~ /^($C_ascii|$C_sjis|$C_sjis_kana)+$/o;
    }

    if ($euc && $sjis) {
	return 'sORe';
    } elsif ($euc) {
	return 'euc';
    } elsif ($sjis) {
	return 'sjis';
    }
    return '8bit';
}

##### BODY CODE CHECKER #####
#
# code_check_body(content)
#	content: pointer to body content line list
#	return value: encode type
#		ASCII
#		8BIT
#		JIS
#		EUC
#		SJIS
#
sub code_check_body ($) {
    my $content = shift;
    my (%count) = ();

    $count{'ascii'} = 0;	# for debug print
    $count{'8bit'} = 0;
    $count{'jis'} = 0;
    $count{'euc'} = 0;
    $count{'sjis'} = 0;
    $count{'sORe'} = 0;
    $count{'has8bit'} = 0;
    $count{'total'} = 0;

    my $i;
    for ($i = 0; $i <= $#$content; $i++) {
	$count{code_check($$content[$i])}++;
	my $line = $$content[$i];
	$count{'total'} += length($line);
	$line =~ s/[^\x80-\xff]//g;
	$count{'has8bit'} += length($line);
    }
    # select encoding
    if ($count{'has8bit'} * 8 > $count{'total'}) {
	$main::Need_base64_encoded = 1;
    } else {
	$main::Need_base64_encoded = 0;
    }
    if (&debug('code')) {
	im_debug("ascii = $count{'ascii'}\n");
	im_debug("8bit = $count{'8bit'}\n");
	im_debug("jis = $count{'jis'}\n");
	im_debug("euc = $count{'euc'}\n");
	im_debug("sjis = $count{'sjis'}\n");
	im_debug("sORe = $count{'sORe'}\n");
    }
    return '8BIT' if ($count{'8bit'});
    if ($count{'jis'}) {
	return '8BIT'
	    if ($count{'sORe'} || $count{'sjis'} || $count{'euc'});
	return 'JIS';
    }
    if ($count{'sjis'}) {
	return '8BIT' if ($count{'euc'});
	return 'SJIS';
    }
    return 'EUC' if ($count{'euc'});
    return uc($main::Default_code) if ($count{'sORe'});
    return 'ASCII';
}

##### CONVERT BODY INTO ISO-2022-JP ENCODING #####
#
# convert_iso2022jp_body(content)
#	content: pointer to body content line list
#	code: input kanji code
#	return value: none
#
sub convert_iso2022jp_body ($$) {
    my ($content, $code) = @_;

    my $i;
    for ($i = 0; $i <= $#$content; $i++) {
	$$content[$i] = conv_iso2022jp($$content[$i], $code);
    }
}

##### ISO-2022-JP CODE CONVERSION #####
#
# conv_iso2022jp(line)
#	line: a line of string to be converted
#	code: input kanji code
#	return value: converted line
#
sub conv_iso2022jp ($;$) {
    my ($line, $code) = @_;
    
    im_debug("conv_iso2022jp: $line\n") if (&debug('japanese'));

    if ($code eq 'NoHankana') {
	$code = uc(code_check($line, 1));
    } elsif (undefined($code)) {
	$code = uc(code_check($line));
    }
    $code = uc($main::Default_code) if ($code == 'SORE');

    if ($code eq 'ASCII') {
	im_debug("source is ascii\n") if (&debug('japanese'));
	return $line;
    } elsif ($code eq 'JIS') {
	im_debug("source is jis\n") if (&debug("japanese"));
	return $line;
    } elsif ($code eq 'EUC') {
	im_debug("source is euc\n") if (&debug('japanese'));
	return &conv_from_euc($line);
    } elsif ($code eq 'SJIS') {
	im_debug("source is sjis\n") if (&debug('japanese'));
	return &conv_from_sjis($line);
    }
    im_debug("source is unknown, nothing done\n") if (&debug('japanese'));
    return $line;
}

##### ISO-2022-JP CODE CONVERSION FROM SJIS #####
#
# conv_from_sjis(line)
#	line: a line of string to be converted
#	return value: converted line
#
sub conv_from_sjis ($) {
    my $line = shift;
    $line =~ s/(($C_sjis)+|($C_sjis_kana)+)/s2j($&, $')/geo;
    return $line;
}

# s2e() is taken from jcode.pl-1.9/2.3 by utashiro at example.com
######################################################################
#
# jcode.pl: Perl library for Japanese character code conversion
#
# Copyright (c) 1995,1996,1997 Kazumasa Utashiro <utashiro at example.com>
# Internet Initiative Japan Inc.
# 1-4 Sanban-cho, Chiyoda-ku, Tokyo 102, Japan
#
# Copyright (c) 1992,1993,1994 Kazumasa Utashiro
# Software Research Associates, Inc.
#
# Original version was developed under the name of srekcah at example.com
# February 1992 and it was called kconv.pl at the beginning.  This
# address was a pen name for group of individuals and it is no longer
# valid.
#
# Use and redistribution for any purpose, without significant
# modification, is granted as long as all copyright notices are
# retained.  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES ARE DISCLAIMED.
######################################################################
sub s2e ($) {
    my $code = shift;
    my ($c1, $c2) = unpack('CC', $code);
    if (0xa1 <= $c1 && $c1 <= 0xdf) {
	$c2 = $c1;
	$c1 = 0x8e;
    } elsif ($c2 >= 0x9f) {
	$c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe0 : 0x60);
	$c2 += 2;
    } else {
	$c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe1 : 0x61);
	$c2 += 0x60 + ($c2 < 0x7f);
    }
    return pack('CC', $c1, $c2);
}
# s2j() is based on _sjis2jis() of jcode.pl-1.9/2.0 by utashiro at example.com
sub s2j ($$) {
    my ($cur, $rest) = @_;
    if ($cur =~ /^($C_sjis_kana+)/o) {
	$cur =~ tr/\xa1-\xdf/\x21-\x5f/;
	return "$E_kana$cur" if ($rest =~ /^($C_sjis|$C_sjis_kana)/o);
	return "$E_kana$cur$E_asc";
    } else {
	$cur =~ s/(..)/s2e($1)/ge;
	$cur =~ tr/\xa1-\xfe/\x21-\x7e/;
	return "$E_jp$cur" if ($rest =~ /^($C_sjis|$C_sjis_kana)/o);
	return "$E_jp$cur$E_asc";
    }
}

##### ISO-2022-JP CODE CONVERSION FROM EUC #####
#
# conv_from_euc(line)
#	line: a line of string to be converted
#	return value: converted line
#
sub conv_from_euc ($) {
    my $line = shift;
    $line =~ s/(($C_euc)+|($C_euc_kana)+)/e2j($&, $')/geo;
    return $line;
}
# e2j() is based on _euc2jis() of jcode.pl-1.9/2.0 by utashiro at example.com
sub e2j ($$) {
    my ($cur, $rest) = @_;
    $cur =~ tr/\xa1-\xfe/\x21-\x7e/;
    if ($cur =~ /\x8e/) {
	$cur =~ tr/\x8e//d;
	return "$E_kana$cur" if ($rest =~ /^($C_euc|$C_euc_kana)/o);
	return "$E_kana$cur$E_asc";
    } else {
	return "$E_jp$cur" if ($rest =~ /^($C_euc|$C_euc_kana)/o);
	return "$E_jp$cur$E_asc";
    }
}

1;