# ngram 類似度 Ruby (n-gram similarity in Ruby)
# 無駄なことやりまくる (deliberately does a lot of wasteful work)
require "enumerator"
# Breaks a string into its overlapping character n-grams.
#
# @param data [String] text to slice; it is split into characters with a
#   Unicode-aware regexp so multibyte characters are kept whole
# @param n [Integer] number of characters in each gram
# @return [Array<String>] every run of +n+ consecutive characters, in order of
#   appearance with duplicates kept; empty when +data+ has fewer than +n+
#   characters
def ngram(data, n)
  characters = data.split(//u)
  characters.each_cons(n).map(&:join)
end
# Jaccard similarity between the character n-gram sets of two strings.
#
# @param a [String] first text
# @param b [String] second text
# @param n [Integer] gram length handed to +ngram+
# @return [Float] size of the intersection of the two gram sets divided by the
#   size of their union, from 0.0 (nothing shared) to 1.0 (identical sets);
#   0.0 when neither string produces any n-gram
def sim(a, b, n)
  agram = ngram(a, n)
  bgram = ngram(b, n)
  union = agram | bgram

  # With no grams on either side the ratio would be 0.0 / 0.0 = NaN, and NaN
  # cannot be compared, so callers sorting by this score would raise.
  return 0.0 if union.empty?

  (agram & bgram).size.to_f / union.size
end
require "rexml/document"
require "open-uri"
include REXML
# Download the RSS feed and collect one hash per <item> with its title, link
# and the plain text of its body.
entries = []
# URI.open is the open-uri entry point for URLs (Kernel#open no longer accepts
# them on Ruby 3.0+); the block form closes the handle once the body is read.
doc = REXML::Document.new(URI.open("http://d.hatena.ne.jp/cho45/rss", &:read))
doc.elements.each("//item") do |e|
  title = e.elements["title"].text
  link = e.elements["link"].text
  # An item without content:encoded (or with an empty one) yields nil here;
  # treat it as empty text instead of crashing on nil.
  body = e.elements["content:encoded"]&.text.to_s
  # Drop markup tags, then collapse each run of whitespace into one space.
  # Non-destructive gsub leaves the string owned by the REXML node untouched.
  text = body.gsub(/<[^>]+>/, "").gsub(/\s+/, " ")
  entries << {
    title: title,
    link: link,
    text: text,
  }
end
# Score each unordered pair of entries exactly once. Results are keyed by the
# sorted pair of links, so a pair is the same whichever entry comes first.
results = {}
entries.combination(2) do |a, b|
  # Entries sharing a link are treated as the same article.
  next if a[:link] == b[:link]

  key = [a[:link], b[:link]].sort
  # The first pair seen for a key wins; later duplicates are skipped.
  next if results.key?(key)

  results[key] = { a: a, b: b, sim: sim(a[:text], b[:text], 3) }
end
# Print every scored pair from least to most similar: one "link: title" line
# per entry, then the score, then a blank separator line.
results.sort_by { |_links, pair| pair[:sim] }.each do |_links, pair|
  [pair[:a], pair[:b]].each do |entry|
    puts format("%s: %s", entry[:link], entry[:title])
  end
  puts pair[:sim]
  puts
end