# User talk:Tapin/CategorizationBot.rb
# From Geo Hashing
#!/usr/bin/ruby
require 'uri'
require 'net/http'
require 'rexml/document'
# Builds URLs for the geohashing-wiki MediaWiki API endpoint.
# Set +queryParams+ to a Hash of parameter-name => value before calling
# +buildUrl+; values are form-encoded, so spaces and reserved characters
# are safe to pass.
class XkcdUrl
  attr_accessor :host        # API host (defaults to the geohashing wiki)
  attr_accessor :path        # API path (defaults to api.php)
  attr_accessor :queryParams # Hash of query parameters, or nil for none

  # host/path default to the geohashing wiki's api.php endpoint.
  def initialize(host = 'wiki.xkcd.com', path = '/wgh/api.php')
    @host = host
    @path = path
  end

  # Returns a URI::HTTP for the configured host, path and query params.
  # Fix: the original interpolated "#{k}=#{v}" with no escaping, so a
  # value containing '&', '=', '#' or a space produced a malformed URL;
  # URI.encode_www_form encodes each pair per application/x-www-form-urlencoded.
  def buildUrl
    if @queryParams.nil?
      URI::HTTP.build(:host => @host, :path => @path)
    else
      URI::HTTP.build(:host => @host, :path => @path,
                      :query => URI.encode_www_form(@queryParams))
    end
  end
end
# a) Class for category retrieval
# Represents one "Meetup on YYYY-MM-DD" wiki category.
class MeetupOn
  attr_accessor :date # meetup date as a "YYYY-MM-DD" string

  # Defaults to today's date, resolving the original TODO that left a
  # hard-coded "2008-06-10" placeholder. An explicit date string may
  # still be passed, exactly as before.
  def initialize(date = Time.now.strftime('%Y-%m-%d'))
    @date = date
  end

  # TODO: build the category URL here (needs host/path plumbing shared
  # with XkcdUrl — see the "Need parent" note in the original).
  def getUrl
  end
end
# b) Class for category member retrieval
# c) Class for images linked from (b)
# Fetches every "Meetup on ..." category name from the wiki API.
# Returns an array of category-name strings (no "Category:" prefix),
# or [] on any non-200 HTTP response.
# Fix: the original stopped after the first 500 results (its own TODO);
# this follows the API's query-continue/acfrom cursor until every
# matching category has been retrieved.
def getMeetupCategories
  cats = []
  acfrom = nil
  loop do
    wiki = XkcdUrl.new
    wiki.queryParams = {}
    wiki.queryParams["action"] = "query"
    wiki.queryParams["list"] = "allcategories"
    wiki.queryParams["acprefix"] = "Meetup_on"
    wiki.queryParams["prop"] = "info"
    wiki.queryParams["format"] = "xml"
    wiki.queryParams["aclimit"] = "500"
    wiki.queryParams["acfrom"] = acfrom unless acfrom.nil?
    r = Net::HTTP.get_response(wiki.buildUrl)
    return [] if r.code != "200"
    xml = REXML::Document.new r.body
    xml.elements.each("api/query/allcategories/c") { |elt| cats.push elt.get_text.to_s }
    # The continuation marker lives at api/query-continue/allcategories
    # (@acfrom); its absence means the last page has been consumed.
    cont = xml.elements["api/query-continue/allcategories"]
    break if cont.nil?
    acfrom = cont.attributes["acfrom"]
    break if acfrom.nil?
  end
  return cats
end
# Returns the titles of all main-namespace pages in +category+ (spaces
# in the category name are normalized to underscores for the API).
# Returns [] when the API call does not succeed.
def getPagesFromCategory(category)
  wiki = XkcdUrl.new
  wiki.queryParams = {
    "action" => "query",
    "list" => "categorymembers",
    "cmtitle" => "Category:" + category.to_s.tr(' ', '_'),
    "cmlimit" => "500",
    "format" => "xml",
    "cmnamespace" => 0 # Only get real pages -- 6 is 'Image:'
  }
  response = Net::HTTP.get_response(wiki.buildUrl)
  return [] unless response.code == "200"
  doc = REXML::Document.new(response.body)
  titles = []
  doc.elements.each("api/query/categorymembers/cm") do |member|
    titles << member.attributes['title'].to_s
  end
  titles
end
# Returns the titles of all images embedded in the +meetup+ page
# (spaces in the title are normalized to underscores for the API).
# Returns [] when the API call does not succeed.
def getImagesFromMeetup(meetup)
  wiki = XkcdUrl.new
  wiki.queryParams = {
    "action" => "query",
    "titles" => meetup.to_s.tr(' ', '_'),
    "prop" => "images",
    "format" => "xml"
  }
  response = Net::HTTP.get_response(wiki.buildUrl)
  return [] unless response.code == "200"
  doc = REXML::Document.new(response.body)
  titles = []
  doc.elements.each("api/query/pages/page/images/im") do |img|
    titles << img.attributes['title'].to_s
  end
  titles
end
# Returns the category titles attached to the +image+ page (spaces in
# the title are normalized to underscores for the API).
# Returns [] when the API call does not succeed.
def getCategoriesFromImage(image)
  wiki = XkcdUrl.new
  wiki.queryParams = {
    "action" => "query",
    "titles" => image.to_s.tr(' ', '_'),
    "prop" => "categories",
    "format" => "xml"
  }
  response = Net::HTTP.get_response(wiki.buildUrl)
  return [] unless response.code == "200"
  doc = REXML::Document.new(response.body)
  titles = []
  doc.elements.each("api/query/pages/page/categories/cl") do |cl|
    titles << cl.attributes['title'].to_s
  end
  titles
end
# Top-level driver: lists every "Meetup on" category, then for each
# category walks its member pages and each page's embedded images,
# printing a notice for any image already tagged with that meetup's
# category. A one-second sleep follows every API request to throttle
# the bot's load on the wiki.
def main
  categories = getMeetupCategories
  sleep 1
  categories.each do |category|
    puts "Category:#{category}"
    pages = getPagesFromCategory(category)
    sleep 1
    pages.each do |page|
      puts " #{page}"
      page_images = getImagesFromMeetup(page)
      sleep 1
      page_images.each do |image|
        puts " #{image}"
        image_categories = getCategoriesFromImage(image)
        sleep 1
        next unless image_categories.include?("Category:#{category}")
        puts "Found an image that _is_ categorized: #{image} in #{category}"
      end
    end
  end
end
main