[Mew-dist 10313] Re: Japanese nickname in Addrbook

Kazumasa Utashiro utashiro at example.com
1999年 8月 21日 (土) 01:21:35 JST


From: Kazu Yamamoto (山本和彦) <kazu at example.com>
Subject: [Mew-dist 10285] Re: Japanese nickname in Addrbook
Date: Fri, 20 Aug 1999 17:34:39 +0900

> > IM の初期の頃の議論で $&, $`, $' をプログラム中で 1回でも使用すると、
> > パターンマッチごとに $&, $`, $' に対応する文字列をコピーして持つように
> > なるためパフォーマンスが悪くなるという話だったので、$&, $`, $' は敢え
> > て使わないようになっていたのですが、実際のところ、どうなんでしょうね。
> 
> Perl ハッカーで「正規表現」監訳者である歌代さん(上司ともいう)が添削し
> て下さることになりましたので、125 では直ると思います。

im-124 の中を探すと、これだけみつかります。

./IM/Japanese.pm:226:    $line =~ s/(($C_sjis)+|($C_sjis_kana)+)/s2j($&, $')/geo;
./IM/Japanese.pm:290:    $line =~ s/(($C_euc)+|($C_euc_kana)+)/e2j($&, $')/geo;
./IM/Japanese.pm:316:    $line =~ s/$C_sjis|$C_sjis_kana/s2e($&)/geo;  
./IM/Scan.pm:895:		$_ = $';
./imput.in:414:	    &add_header(\@Header, 0, $1, $');
./imsetup.in:208:		$mhdir = $mail =  $';
./imsetup.in:212:		$mhalias = $';
./imsetup.in:215:		$mhalias = $';
./imsetup.in:218:		$address = $';
./imsetup.in:291:	$domain = $';

imput と imsetup は影響範囲が狭いのでとりあえず放置することにして、
Japanese.pm と Scan.pm を修正しました。

From: Motonori Nakamura <motonori at example.com>
Subject: Re: naughtiness check sub
Date: Fri, 22 Aug 1997 09:20:51 +0900

> sub conv_from_euc {
>     my $line = shift;
>     while ($line =~ /(.*)($C_euc|$C_euc_kana)(.*)/s) {
>         $line = $1 . &e2j($2, $5) . $5;
>     }
>     return $line;
> }   
> 
> なんてすると、コピーが発生しまくって、「生麦生米生卵」という
> 短い文字列に対してでさえ 10倍遅くなってしまうのですが、
> 何か妙案はあるでしょうか?

については jcode.pl と同様に、処理のレベルを一段増やすことで対応しまし
た。Perl5 を前提とすれば、lookahead を使って今までの構造を維持すること
もできるのですが、それがそれほど優れた方法とも思えないのでやめました。

ついでに jcode.pl の copyright もうるさいのでとっちゃいました。

Scan.pm の方は定石ね。

実は eyeball-check しかしていないので、誰かテストしてください。

--utashiro

-------------- next part --------------
*** IM/Japanese.pm.bak	Fri Aug 20 15:15:03 1999
--- IM/Japanese.pm	Sat Aug 21 00:28:42 1999
***************
*** 223,254 ****
  #
  sub conv_from_sjis ($) {
      my $line = shift;
!     $line =~ s/(($C_sjis)+|($C_sjis_kana)+)/s2j($&, $')/geo;
      return $line;
  }
- 
- # s2e() is taken from jcode.pl-1.9/2.3 by utashiro at example.com
- ######################################################################
- #
- # jcode.pl: Perl library for Japanese character code conversion
- #
- # Copyright (c) 1995,1996,1997 Kazumasa Utashiro <utashiro at example.com>
- # Internet Initiative Japan Inc.
- # 1-4 Sanban-cho, Chiyoda-ku, Tokyo 102, Japan
- #
- # Copyright (c) 1992,1993,1994 Kazumasa Utashiro
- # Software Research Associates, Inc.
- #
- # Original version was developed under the name of srekcah at example.com
- # February 1992 and it was called kconv.pl at the beginning.  This
- # address was a pen name for group of individuals and it is no longer
- # valid.
- #
- # Use and redistribution for any purpose, without significant
- # modification, is granted as long as all copyright notices are
- # retained.  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
- # ANY EXPRESS OR IMPLIED WARRANTIES ARE DISCLAIMED.
- ######################################################################
  sub s2e ($) {
      my $code = shift;
      my ($c1, $c2) = unpack('CC', $code);
--- 223,236 ----
  #
  sub conv_from_sjis ($) {
      my $line = shift;
!     $line =~ s/((?:$C_sjis|$C_sjis_kana)+)/sjis2jis($1).$E_asc/geo;
!     return $line;
! }
! sub sjis2jis ($) {
!     my $line = shift;
!     $line =~ s/((?:$C_sjis)+|(?:$C_sjis_kana)+)/s2j($1)/geo;
      return $line;
  }
  sub s2e ($) {
      my $code = shift;
      my ($c1, $c2) = unpack('CC', $code);
***************
*** 264,281 ****
      }
      return pack('CC', $c1, $c2);
  }
! # s2j() is based on _sjis2jis() of jcode.pl-1.9/2.0 by utashiro at example.com
! sub s2j ($$) {
!     my ($cur, $rest) = @_;
!     if ($cur =~ /^($C_sjis_kana+)/o) {
  	$cur =~ tr/\xa1-\xdf/\x21-\x5f/;
! 	return "$E_kana$cur" if ($rest =~ /^($C_sjis|$C_sjis_kana)/o);
! 	return "$E_kana$cur$E_asc";
      } else {
  	$cur =~ s/(..)/s2e($1)/ge;
  	$cur =~ tr/\xa1-\xfe/\x21-\x7e/;
! 	return "$E_jp$cur" if ($rest =~ /^($C_sjis|$C_sjis_kana)/o);
! 	return "$E_jp$cur$E_asc";
      }
  }
  
--- 246,260 ----
      }
      return pack('CC', $c1, $c2);
  }
! sub s2j ($) {
!     my $cur = @_;
!     if ($cur =~ /^$C_sjis_kana/o) {
  	$cur =~ tr/\xa1-\xdf/\x21-\x5f/;
! 	return "$E_kana$cur";
      } else {
  	$cur =~ s/(..)/s2e($1)/ge;
  	$cur =~ tr/\xa1-\xfe/\x21-\x7e/;
! 	return "$E_jp$cur";
      }
  }
  
***************
*** 287,306 ****
  #
  sub conv_from_euc ($) {
      my $line = shift;
!     $line =~ s/(($C_euc)+|($C_euc_kana)+)/e2j($&, $')/geo;
      return $line;
  }
- # e2j() is based on _euc2jis() of jcode.pl-1.9/2.0 by utashiro at example.com
  sub e2j ($$) {
!     my ($cur, $rest) = @_;
      $cur =~ tr/\xa1-\xfe/\x21-\x7e/;
      if ($cur =~ /\x8e/) {
  	$cur =~ tr/\x8e//d;
! 	return "$E_kana$cur" if ($rest =~ /^($C_euc|$C_euc_kana)/o);
! 	return "$E_kana$cur$E_asc";
      } else {
! 	return "$E_jp$cur" if ($rest =~ /^($C_euc|$C_euc_kana)/o);
! 	return "$E_jp$cur$E_asc";
      }
  }
  
--- 266,287 ----
  #
  sub conv_from_euc ($) {
      my $line = shift;
!     $line =~ s/((?:$C_euc|$C_euc_kana)+)/euc2jis($1).$E_asc/geo;
!     return $line;
! }
! sub euc2jis ($) {
!     my $line = shift;
!     $line =~ s/((?:$C_euc)+|(?:$C_euc_kana)+)/e2j($1)/geo;
      return $line;
  }
  sub e2j ($$) {
!     my $cur = @_;
      $cur =~ tr/\xa1-\xfe/\x21-\x7e/;
      if ($cur =~ /\x8e/) {
  	$cur =~ tr/\x8e//d;
! 	return "$E_kana$cur";
      } else {
! 	return "$E_jp$cur";
      }
  }
  
***************
*** 313,319 ****
  
  sub conv_euc_from_sjis ($) {
      my $line = shift;
!     $line =~ s/$C_sjis|$C_sjis_kana/s2e($&)/geo;  
      return $line;
  }
  
--- 294,300 ----
  
  sub conv_euc_from_sjis ($) {
      my $line = shift;
!     $line =~ s/($C_sjis|$C_sjis_kana)/s2e($1)/geo;  
      return $line;
  }
  
*** IM/Scan.pm.bak	Fri Aug 20 15:15:04 1999
--- IM/Scan.pm	Sat Aug 21 00:20:28 1999
***************
*** 888,898 ****
  	    $_ = $line;
  	    s/"([^"]+)"/w2n($1)/geo;
              s/,\s+/,/g;
! 	    if (/^(\S+)\s+(\S+)\s+(\S+)/) {
  		$key = $1;
!                 $addr = $2;
!                 $petname = $3;
! 		$_ = $';
  		next if ($key =~ /:$/);
  	        next if $petname eq '*';
  	    } else {
--- 888,897 ----
  	    $_ = $line;
  	    s/"([^"]+)"/w2n($1)/geo;
              s/,\s+/,/g;
! 	    if (s/^(\S+)\s+(\S+)\s+(\S+)//) {
  		$key = $1;
! 		$addr = $2;
! 		$petname = $3;
  		next if ($key =~ /:$/);
  	        next if $petname eq '*';
  	    } else {


Mew-dist メーリングリストの案内