2007年 11月 20日

一部のHTML要素のみ通すフィルタ

ちがうちがうおれはなにをやっているんだ……

require "strscan"

# HTMLFilter
# Passes through only the allowed elements and attributes, escapes every other
# piece of markup as text, and balances tags (closing whatever is left open).
#
# NOTE(review): attribute *values* are escaped but not validated, so an allowed
# "href" may still carry a "javascript:" URL. Validate URLs separately if the
# output is shown to other users.
class HTMLFilter
	# Characters escaped in text runs.
	ESCAPE = { '<'  => '&lt;', '>'  => '&gt;', '"' => '&quot;' }.freeze

	# Attribute values are written inside single quotes, so the single quote
	# must be escaped as well or a value could break out of its attribute.
	ATTR_ESCAPE = ESCAPE.merge("'" => '&#39;').freeze

	# Void elements: always written as "<name />" and never pushed on the stack.
	EMPTY_ELEMENTS = ["br", "hr", "img"].freeze

	# One quoted attribute: name="value" or name='value'.
	ATTRIBUTE = %r{\s+([a-z-]+)=(?:"([^"]*)"|'([^']*)')}i

	# allow    - Hash of element name => Array of attribute names permitted on it.
	# callback - callable applied to every escaped text run between tags
	#            (defaults to the identity).
	def initialize(allow, callback=proc {|x| x})
		@allow = allow
		@callback = callback
	end

	# Filters +input+ and returns the sanitized, tag-balanced HTML string.
	def filter(input)
		rules = normalized_rules
		# Built once per call. The negative lookahead stops "<address" from being
		# taken for "<a"; an empty whitelist yields a pattern that never matches.
		tag = %r{<(/)?(#{Regexp.union(rules.keys).source})(?![a-z0-9:-])}i
		s = StringScanner.new(input)
		ret = "".dup
		pool = "".dup
		elements = []
		until s.eos?
			unless s.scan(tag)
				pool << s.getch
				next
			end
			# Flush the text collected since the previous tag.
			ret << @callback[escape(pool)]
			pool = "".dup
			end_tag = s[1]
			name = s[2].downcase
			if end_tag
				s.scan(%r{\s*>})
				close_element(name, elements, ret)
			else
				open_element(s, name, rules[name], elements, ret)
			end
		end
		ret << @callback[escape(pool)]
		# Close everything the input left open, innermost first.
		ret << "</#{elements.pop}>" until elements.empty?
		ret
	end

	# Escapes <, > and " in a text run.
	def escape(str)
		str.gsub(Regexp.union(ESCAPE.keys), ESCAPE)
	end

	private

	# Escapes an attribute value for output inside single quotes.
	def escape_attribute(str)
		str.gsub(Regexp.union(ATTR_ESCAPE.keys), ATTR_ESCAPE)
	end

	# Whitelist with element and attribute names lower-cased, so lookups agree
	# with the case-insensitive tag matching.
	def normalized_rules
		@allow.each_with_object({}) do |(element, attributes), rules|
			rules[element.to_s.downcase] = Array(attributes).map {|a| a.to_s.downcase }
		end
	end

	# Handles an end tag: closes open elements down to and including +name+.
	# An end tag whose element is not open is dropped without touching the stack.
	def close_element(name, elements, ret)
		return unless elements.include?(name)
		loop do
			opened = elements.pop
			ret << "</#{opened}>"
			break if opened == name
		end
	end

	# Handles a start tag: keeps allowed attributes, writes the tag and records
	# the element as open unless it is void or self-closed.
	def open_element(s, name, allowed, elements, ret)
		attrs = []
		while s.scan(ATTRIBUTE)
			an = s[1].downcase
			av = s[2] || s[3]
			attrs << "#{an}='#{escape_attribute(av)}'" if allowed.include?(an)
		end
		attrs = attrs.empty? ? "" : " #{attrs.join(" ")}"
		# If the tag end is missing or malformed the scan fails; the tag is still
		# written and the leftover characters are treated as text.
		self_closed = s.scan(%r{\s*(/)?>}) && s[1]
		if EMPTY_ELEMENTS.include?(name)
			ret << "<#{name}#{attrs} />"
		elsif self_closed
			ret << "<#{name}#{attrs}></#{name}>"
		else
			ret << "<#{name}#{attrs}>"
			elements.push(name)
		end
	end
end

# Whitelist: element name => attribute names allowed on that element.
opts = {
	"a"      => ["href", "name"],
	"strong" => [],
	"br"     => [],
	"p"      => [],
	"ins"    => ["datetime"],
	"del"    => ["datetime"],
}

# The sample document is everything after __END__ in this file.
inputs = DATA.read
# The callback receives each escaped text run; it turns every line break into
# "<br />" followed by the line break.
out = HTMLFilter.new(opts, proc {|str|
	str.gsub(/\n/, "<br />\n")
}).filter(inputs)

puts out



__END__
<script foo="<script>alert('bar')</script>">alert('foo')</script>
<script foo="<a href='link'>link</a>">alert('foo')</script>
<a href='www.g>oogle.com'>link</a>
<a href="hoge" name="aaa">hoge<strong style="">ttt</strong></a>
<a href="hoge" name="aa
{a">hoge<br><br/></a>

<ins datetime="">
<p>
aaa
</p>

<p>aaa

</ins>

<a></strong>
<p>aaa

どう書く?org のお題 をやっていたんだった。なんかだんだんズレてきたので投稿がためらわれる。改行を br にしろというお題 に対応ずみ。