[Mew-dist 05699] Re: code_check_body in Japanese.pm problem
SAKAI Kiyotaka
ksakai at example.com
1998年 7月 27日 (月) 20:43:30 JST
>> In article <19980727202200O.os at example.com>, OHARA Shigeki <os at example.com> writes:
> これは、code_check_body の問題のように思えるんですが、
> どうなんでしょうか?
さっき img に投げたのですが、Japanese.pm.in を以下のものと置き換えてみ
て下さい。
--
酒井 清隆 (E-mail: ksakai at example.com)
-------------- next part --------------
# -*-Perl-*-
################################################################
###
### Japanese.pm
###
### Copyright (C) 1997 Internet Message Group
###
### This Perl5 library conforms
### GNU GENERAL PUBLIC LICENSE Version 2.
###
###
### Author: Internet Message Group <img at example.com>
### Created: Apr 23, 1997
### Revised: @im_revised@
###
my $PM_VERSION = "IM::Japanese.pm @im_version@";
package IM::Japanese;
require 5.003;
require Exporter;
use IM::Util;
use integer;
use strict;
use vars qw(@ISA @EXPORT);
@ISA = qw(Exporter);
@EXPORT = qw(code_check code_check_body
convert_iso2022jp_body conv_iso2022jp);
=head1 NAME
Japanese - IM Japanese handler
=head1 SYNOPSIS
=head1 DESCRIPTION
=cut
use vars qw($C_jis $C_jis_roman $C_sjis $C_sjis_kana
$C_euc $C_euc_kana $C_SorE $C_ascii
$C_pascii $C_tascii $C_sascii $C_8bit
$E_jp $E_asc $E_kana);
BEGIN {
$C_jis = '\e\$[@B]([\x21-\x7e][\x21-\x7e])+';
$C_jis_roman = '\e\([BJ][\s\x21-\x7e]*';
$C_sjis = '[\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc]';
$C_sjis_kana = '[\xa1-\xdf]';
$C_euc = '[\xa1-\xfe][\xa1-\xfe]';
$C_euc_kana = '\x8e[\xa1-\xdf]';
$C_SorE = '[\xa1-\xdf]|[\x8e\xe0-\xfc][\xa1-\xfc]';
$C_ascii = '[\s\x21-\x7e]';
# $C_ascii = '[\x07\s\x21-\x7e]'; # for IRC freak :-)
$C_pascii = '[\x21-\x7e]';
$C_tascii = '[\x21\x23-\x27\x2a\x2b\x2d\x30-\x39\x41-\x5a\x5e-\x7e]';
$C_sascii = '[\x22\x28\x29\x2c\x2e\x2f\x3a-\x40\x5b-\x5d]';
$C_8bit = '[\x80-\xff]';
($E_jp, $E_asc, $E_kana) = ("\e\$B", "\e(B", "\e(I");
}
##### CODE CHECKER #####
#
# code_check($line, $use_hankaku_kana)
# return value: encoding type
# ascii
# 8bit
# jis
# euc
# sjis
# sORe
#
sub code_check ($;$) {
my ($line, $no_hankaku_kana) = @_;
my ($sjis, $euc);
if ($line =~ /^$C_ascii*$/o) {
return 'ascii';
} elsif ($line =~ /$C_jis/o) {
return 'jis';
}
if ($no_hankaku_kana) {
$euc = 1 if $line =~ /^($C_ascii|$C_euc)+$/o;
$sjis = 1 if $line =~ /^($C_ascii|$C_sjis)+$/o;
} else {
$euc = 1 if $line =~ /^($C_ascii|$C_euc|$C_euc_kana)+$/o;
$sjis = 1 if $line =~ /^($C_ascii|$C_sjis|$C_sjis_kana)+$/o;
}
if ($euc && $sjis) {
return 'sORe';
} elsif ($euc) {
return 'euc';
} elsif ($sjis) {
return 'sjis';
}
return '8bit';
}
##### BODY CODE CHECKER #####
#
# code_check_body(content)
# content: pointer to body content line list
# return value: encode type
# ASCII
# 8BIT
# JIS
# EUC
# SJIS
#
sub code_check_body ($) {
my $content = shift;
my (%count) = ();
$count{'ascii'} = 0; # for debug print
$count{'8bit'} = 0;
$count{'jis'} = 0;
$count{'euc'} = 0;
$count{'sjis'} = 0;
$count{'sORe'} = 0;
$count{'has8bit'} = 0;
$count{'total'} = 0;
my $i;
for ($i = 0; $i <= $#$content; $i++) {
$count{code_check($$content[$i])}++;
my $line = $$content[$i];
$count{'total'} += length($line);
$line =~ s/[^\x80-\xff]//g;
$count{'has8bit'} += length($line);
}
# select encoding
if ($count{'has8bit'} * 8 > $count{'total'}) {
$main::Need_base64_encoded = 1;
} else {
$main::Need_base64_encoded = 0;
}
if (&debug('code')) {
im_debug("ascii = $count{'ascii'}\n");
im_debug("8bit = $count{'8bit'}\n");
im_debug("jis = $count{'jis'}\n");
im_debug("euc = $count{'euc'}\n");
im_debug("sjis = $count{'sjis'}\n");
im_debug("sORe = $count{'sORe'}\n");
}
return '8BIT' if ($count{'8bit'});
if ($count{'jis'}) {
return '8BIT'
if ($count{'sORe'} || $count{'sjis'} || $count{'euc'});
return 'JIS';
}
if ($count{'sjis'}) {
return '8BIT' if ($count{'euc'});
return 'SJIS';
}
return 'EUC' if ($count{'euc'});
return uc($main::Default_code) if ($count{'sORe'});
return 'ASCII';
}
##### CONVERT BODY INTO ISO-2022-JP ENCODING #####
#
# convert_iso2022jp_body(content)
# content: pointer to body content line list
# code: input kanji code
# return value: none
#
sub convert_iso2022jp_body ($$) {
my ($content, $code) = @_;
my $i;
for ($i = 0; $i <= $#$content; $i++) {
$$content[$i] = conv_iso2022jp($$content[$i], $code);
}
}
##### ISO-2022-JP CODE CONVERSION #####
#
# conv_iso2022jp(line)
# line: a line of string to be converted
# code: input kanji code
# return value: converted line
#
sub conv_iso2022jp ($;$) {
my ($line, $code) = @_;
im_debug("conv_iso2022jp: $line\n") if (&debug('japanese'));
if ($code eq 'NoHankana') {
$code = uc(code_check($line, 1));
} elsif (undefined($code)) {
$code = uc(code_check($line));
}
$code = uc($main::Default_code) if ($code == 'SORE');
if ($code eq 'ASCII') {
im_debug("source is ascii\n") if (&debug('japanese'));
return $line;
} elsif ($code eq 'JIS') {
im_debug("source is jis\n") if (&debug("japanese"));
return $line;
} elsif ($code eq 'EUC') {
im_debug("source is euc\n") if (&debug('japanese'));
return &conv_from_euc($line);
} elsif ($code eq 'SJIS') {
im_debug("source is sjis\n") if (&debug('japanese'));
return &conv_from_sjis($line);
}
im_debug("source is unknown, nothing done\n") if (&debug('japanese'));
return $line;
}
##### ISO-2022-JP CODE CONVERSION FROM SJIS #####
#
# conv_from_sjis(line)
# line: a line of string to be converted
# return value: converted line
#
sub conv_from_sjis ($) {
my $line = shift;
$line =~ s/(($C_sjis)+|($C_sjis_kana)+)/s2j($&, $')/geo;
return $line;
}
# s2e() is taken from jcode.pl-1.9/2.3 by utashiro at example.com
######################################################################
#
# jcode.pl: Perl library for Japanese character code conversion
#
# Copyright (c) 1995,1996,1997 Kazumasa Utashiro <utashiro at example.com>
# Internet Initiative Japan Inc.
# 1-4 Sanban-cho, Chiyoda-ku, Tokyo 102, Japan
#
# Copyright (c) 1992,1993,1994 Kazumasa Utashiro
# Software Research Associates, Inc.
#
# Original version was developed under the name of srekcah at example.com
# February 1992 and it was called kconv.pl at the beginning. This
# address was a pen name for group of individuals and it is no longer
# valid.
#
# Use and redistribution for any purpose, without significant
# modification, is granted as long as all copyright notices are
# retained. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES ARE DISCLAIMED.
######################################################################
sub s2e ($) {
my $code = shift;
my ($c1, $c2) = unpack('CC', $code);
if (0xa1 <= $c1 && $c1 <= 0xdf) {
$c2 = $c1;
$c1 = 0x8e;
} elsif ($c2 >= 0x9f) {
$c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe0 : 0x60);
$c2 += 2;
} else {
$c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe1 : 0x61);
$c2 += 0x60 + ($c2 < 0x7f);
}
return pack('CC', $c1, $c2);
}
# s2j() is based on _sjis2jis() of jcode.pl-1.9/2.0 by utashiro at example.com
sub s2j ($$) {
my ($cur, $rest) = @_;
if ($cur =~ /^($C_sjis_kana+)/o) {
$cur =~ tr/\xa1-\xdf/\x21-\x5f/;
return "$E_kana$cur" if ($rest =~ /^($C_sjis|$C_sjis_kana)/o);
return "$E_kana$cur$E_asc";
} else {
$cur =~ s/(..)/s2e($1)/ge;
$cur =~ tr/\xa1-\xfe/\x21-\x7e/;
return "$E_jp$cur" if ($rest =~ /^($C_sjis|$C_sjis_kana)/o);
return "$E_jp$cur$E_asc";
}
}
##### ISO-2022-JP CODE CONVERSION FROM EUC #####
#
# conv_from_euc(line)
# line: a line of string to be converted
# return value: converted line
#
sub conv_from_euc ($) {
my $line = shift;
$line =~ s/(($C_euc)+|($C_euc_kana)+)/e2j($&, $')/geo;
return $line;
}
# e2j() is based on _euc2jis() of jcode.pl-1.9/2.0 by utashiro at example.com
sub e2j ($$) {
my ($cur, $rest) = @_;
$cur =~ tr/\xa1-\xfe/\x21-\x7e/;
if ($cur =~ /\x8e/) {
$cur =~ tr/\x8e//d;
return "$E_kana$cur" if ($rest =~ /^($C_euc|$C_euc_kana)/o);
return "$E_kana$cur$E_asc";
} else {
return "$E_jp$cur" if ($rest =~ /^($C_euc|$C_euc_kana)/o);
return "$E_jp$cur$E_asc";
}
}
1;
Mew-dist メーリングリストの案内