2012-01-18 :-(
_ tdiary grep.rb で invalid multibyte character (1)
青木さんのアレ http://i.loveruby.net/svn/public/tdiarytools/trunk/grep.rb
% ruby --version ruby 1.9.2p180 (2011-02-18 revision 30909) [i386-netbsdelf]
UTF-8 対応と ruby 1.9 対応をしてみる。あと、いわゆるデバッグ print を追加。
--- C:/Users/rin/Desktop/grep.rb.orig Wed Jan 18 23:21:35 2012 +++ C:/Users/rin/Desktop/grep.rb Wed Jan 18 23:19:57 2012 @@ -1,4 +1,5 @@ #!/usr/bin/env ruby +# -*- coding: utf-8 -*- # # $Id$ # @@ -26,7 +27,7 @@ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> <html lang="ja-JP"> <head> - <meta http-equiv="Content-Type" content="text/html; charset=euc-jp"> + <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> <meta http-equiv="Content-Language" content="ja-JP"> <meta name="robots" content="none"> <title>tDiary Grep</title> @@ -115,12 +116,12 @@ begin Uconv.u8toeuc(str) rescue Uconv::Error - NKF::nkf('-e -m0', str) + NKF::nkf('-w -m0', str) end end rescue LoadError def to_euc(str) - NKF::nkf('-e -m0', str) + NKF::nkf('-w -m0', str) end end @@ -135,10 +136,10 @@ Z_SPACE = "\241\241" # zen-kaku space -BEGIN { $defout.binmode } +BEGIN { $stdout.binmode } def main - $KCODE = 'EUC' +# $KCODE = 'UTF8' cgi = CGI.new html = '<html><head><title></title></head><body><p>error</p></body></html>' begin @@ -154,11 +155,15 @@ begin begin if LOGGING and File.file?(query_log()) and cgi.valid?('history') + puts "history_page" return history_page() elsif not cgi.valid?('q') + puts "search_form_page()" return search_form_page() else - query = to_euc([cgi.params['q']].compact.flatten.join(' ')) + puts "else" +# query = to_euc([cgi.params['q']].compact.flatten.join(' ')) + query = [cgi.params['q']].compact.flatten.join(' ') html = search_result_page(setup_patterns(query)) save_query(query, query_log()) if LOGGING return html @@ -183,7 +188,7 @@ def send_html(cgi, html) print cgi.header('status' => '200 OK', 'type' => 'text/html', - 'charset' => 'euc-jp', + 'charset' => 'UTF-8', 'Content-Length' => html.length.to_s, 'Cache-Control' => 'no-cache', 'Pragma' => 'no-cache') @@ -191,9 +196,10 @@ end def setup_patterns(query) + puts "setup_patterns" patterns = split_string(query).map {|pat| check_pattern pat - /#{Regexp.quote(pat)}/ie + /#{Regexp.quote(pat)}/iu } 
raise WrongQuery, 'no pattern' if patterns.empty? raise WrongQuery, 'too many sub patterns' if patterns.length > 8 @@ -201,6 +207,7 @@ end def check_pattern(pat) + puts "check_pattern" raise WrongQuery, 'no pattern' unless pat raise WrongQuery, 'empty pattern' if pat.empty? raise WrongQuery, "pattern too short: #{pat}" if pat.length < 2 @@ -208,7 +215,11 @@ end def split_string(str) - str.split(/[\s#{Z_SPACE}]+/oe).reject {|w| w.empty? } + puts "split_string" + puts NKF.guess(str) + puts str.encoding + puts __ENCODING__ + str.split(/[\s#{Z_SPACE}]+/ou).reject {|w| w.empty? } end def save_query(query, file) @@ -247,6 +258,7 @@ end def search_result_page(patterns) + puts "search_result_page" ERB.new(HEADER + SEARCH_RESULT + FOOTER).result(binding()) end @@ -398,7 +410,7 @@ title, body = @source.split(/\n/, 2) sprintf('%-30s | %s', title.to_s.strip, - remove_tags(body.to_s).gsub(/[\s#{Z_SPACE}]+/oe, ' ').slice(/\A.{0,60}/me)) + remove_tags(body.to_s).gsub(/[\s#{Z_SPACE}]+/ou, ' ').slice(/\A.{0,60}/me)) end private
コマンドラインから実行。
% cd ~/public_html/diary % ./grep.rb (offline mode: enter name=value pairs on standard input) q=hoge[Ctrl+D] <==== クエリ else setup_patterns split_string <==== split_string() まで来てる US-ASCII <==== NKF.guess(str) UTF-8 <==== str.encoding UTF-8 <==== __ENCODING__ Status: 200 OK Content-Type: text/html; charset=UTF-8 Content-Length: 362 Cache-Control: no-cache Pragma: no-cache <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> <html lang="ja-JP"> <head> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> <meta http-equiv="Content-Language" content="ja-JP"> <meta name="robots" content="none"> <title>tDiary Grep</title> </head> <body> <pre> q=hoge invalid multibyte character <==== 例外 </pre> </body> </html>
machu さんのところなどを読んだけど力尽きた。
[ツッコミを入れる]