2007年 10月 18日

ngram 類似度 Ruby

無駄なことやりまくる

require "enumerator"

# Split a string into its overlapping character n-grams.
#
# data:: the string to split (characters are taken with a Unicode-aware regexp)
# n::    number of characters per gram
#
# Returns an Array of Strings, one per window of n consecutive characters,
# in order of appearance. A string shorter than n yields an empty Array.
def ngram(data, n)
	chars = data.split(//u)
	chars.each_cons(n).map(&:join)
end

# Similarity of two strings as the overlap of their character n-gram sets.
#
# a, b:: the strings to compare
# n::    n-gram length, passed through to ngram
#
# Returns a Float: the number of distinct n-grams present in both strings
# divided by the number of distinct n-grams present in either one.
# When neither string produces any n-gram (both empty or shorter than n)
# there is nothing to compare and 0.0 is returned.
def sim(a, b, n)
	agram = ngram(a, n)
	bgram = ngram(b, n)

	all = (agram | bgram).size
	# Guard the empty union: 0.0 / 0.0 would be NaN, which has no defined
	# ordering against other floats and so cannot be ranked.
	return 0.0 if all.zero?

	same = (agram & bgram).size
	same.to_f / all
end


require "rexml/document"
require "open-uri"
include REXML

# Download the RSS feed and collect one hash per <item> holding its title,
# link and plain-text body (markup stripped, whitespace runs collapsed).
entries = []
# URI#read comes from open-uri. It is used instead of Kernel#open because
# Kernel#open no longer accepts URLs as of Ruby 3.0.
doc = Document.new(URI.parse("http://d.hatena.ne.jp/cho45/rss").read)
doc.elements.each("//item") do |e|
	title = e.elements["title"].text
	link  = e.elements["link"].text
	# The body element may be absent or empty; treat that as an empty text
	# rather than calling string methods on nil.
	content = e.elements["content:encoded"]
	text = (content ? content.text : nil).to_s
	# Drop anything that looks like a tag, then squeeze every run of
	# whitespace down to a single space.
	text = text.gsub(/<[^>]+>/, "").gsub(/\s+/, " ")
	entries << {
		:title => title,
		:link  => link,
		:text  => text,
	}
end

# Score every unordered pair of entries once. Each result is stored under the
# sorted pair of links and keeps both entries plus their trigram similarity.
results = {}
entries.combination(2) do |a, b|
	# Entries sharing a link are not compared with each other.
	next if a[:link] == b[:link]
	key = [a[:link], b[:link]].sort
	# Distinct entries can still map to an already-scored link pair when a link
	# occurs more than once; the first pair seen wins.
	next if results.key?(key)
	results[key] = {
		:a => a,
		:b => b,
		:sim => sim(a[:text], b[:text], 3)
	}
end
# Print the scored pairs in ascending order of similarity: one "link: title"
# line for each of the two entries, then the similarity value and a blank line.
ranked = results.sort_by { |_key, pair| pair[:sim] }
ranked.each do |_key, pair|
	[:a, :b].each do |side|
		entry = pair[side]
		puts "%s: %s" % [entry[:link], entry[:title]]
	end
	puts pair[:sim]
	puts
end