User talk:Tapin/CategorizationBot.rb

From Geohashing
#!/usr/bin/ruby

require 'uri'
require 'net/http'
require 'rexml/document'

class XkcdUrl
  # Builder for MediaWiki API URLs against the geohashing wiki.
  # Assign queryParams (a Hash of String => value) before calling buildUrl.
  attr_accessor :host
  attr_accessor :path
  attr_accessor :queryParams
  
  def initialize(host = 'geohashing.site', path = '/api.php')
    @host = host
    @path = path
  end

  # Assemble an http:// URI from host, path and (optionally) queryParams.
  # Query values are form-encoded, so parameters containing spaces or
  # reserved characters no longer make URI::HTTP.build raise an
  # InvalidComponentError (the old code joined raw "k=v" pairs).
  # Returns a URI::HTTP instance.
  def buildUrl
    parts = { :host => @host, :path => @path }
    unless @queryParams.nil?
      # URI.encode_www_form percent-encodes each key/value pair (spaces
      # become '+') and joins them with '&'; non-String values (e.g. the
      # integer cmnamespace) are stringified automatically.
      parts[:query] = URI.encode_www_form(@queryParams)
    end
    
    return URI::HTTP.build(parts)
  end
end

# a) Class for category retrieval
class MeetupOn
  # ISO-8601 date string ("YYYY-MM-DD") naming the meetup category.
  attr_accessor :date
  #TODO: Need parent w/host,path etc
  
  # Defaults to today's date, resolving the old TODO that hard-coded
  # "2008-06-10". Pass an explicit "YYYY-MM-DD" string to override.
  def initialize(date = Time.now.strftime("%Y-%m-%d"))
    @date = date
  end

  # Stub -- URL construction is not implemented yet; returns nil.
  def getUrl
    
  end
end

# b) Class for category member retrieval
# c) Class for images linked from (b)

# Fetch the names of all "Meetup_on..." categories from the wiki.
# Follows the API's query-continue paging (the "acfrom" cursor), so
# results beyond the first 500 are retrieved as well -- this resolves
# the old TODO that stopped after one request.
# Returns an Array of category-name Strings ([] on any HTTP error).
def getMeetupCategories
  cats = []
  acfrom = nil
  loop do
    wiki = XkcdUrl.new
    wiki.queryParams = {}
    wiki.queryParams["action"] = "query"
    wiki.queryParams["list"] = "allcategories"
    wiki.queryParams["acprefix"] = "Meetup_on"
    wiki.queryParams["prop"] = "info"
    wiki.queryParams["format"] = "xml"
    wiki.queryParams["aclimit"] = "500"
    # Resume where the previous page left off (underscores for spaces,
    # matching the title convention used elsewhere in this file).
    wiki.queryParams["acfrom"] = acfrom unless acfrom.nil?

    r = Net::HTTP.get_response(wiki.buildUrl)
    return cats if r.code != "200"

    xml = REXML::Document.new r.body
    xml.elements.each("api/query/allcategories/c") { |elt| cats.push elt.get_text.to_s }

    # api/query-continue/allcategories carries the acfrom cursor for the
    # next page; it is absent once the last page has been returned.
    cont = REXML::XPath.first(xml, "api/query-continue/allcategories")
    break if cont.nil?
    acfrom = cont.attributes["acfrom"].to_s.tr(' ', '_')
  end
  return cats
end

# List the titles of the main-namespace pages in the given category
# (retrieves at most the first 500 members in a single API request).
# Returns an Array of page-title Strings ([] on any HTTP error).
def getPagesFromCategory(category)
  wiki = XkcdUrl.new
  wiki.queryParams = {
    "action"      => "query",
    "list"        => "categorymembers",
    "cmtitle"     => "Category:" + category.to_s.tr(' ', '_'),
    "cmlimit"     => "500",
    "format"      => "xml",
    "cmnamespace" => 0 # Only get real pages -- 6 is 'Image:'
  }

  response = Net::HTTP.get_response(wiki.buildUrl)
  return [] unless response.code == "200"

  titles = []
  doc = REXML::Document.new(response.body)
  doc.elements.each("api/query/categorymembers/cm") do |member|
    titles << member.attributes['title'].to_s
  end
  titles
end

# List the image titles embedded in the given meetup page.
# Explicitly requests up to 500 images: without an imlimit the
# MediaWiki API returns only 10, silently truncating image-heavy
# meetup reports.
# Returns an Array of image-title Strings ([] on any HTTP error).
def getImagesFromMeetup(meetup)
  wiki = XkcdUrl.new
  wiki.queryParams = {}
  wiki.queryParams["action"] = "query"
  wiki.queryParams["titles"] = meetup.to_s.tr(' ', '_')
  wiki.queryParams["prop"] = "images"
  wiki.queryParams["imlimit"] = "500" # API default is 10 -- far too small
  wiki.queryParams["format"] = "xml"

  url = wiki.buildUrl
  r = Net::HTTP.get_response(url)
  if (r.code != "200")
    return []
  end

  images = []
  xml = REXML::Document.new r.body
  xml.elements.each("api/query/pages/page/images/im") { |elt| images.push elt.attributes['title'].to_s }
  return images
end

# List the categories the given image page belongs to.
# Explicitly requests up to 500 categories: without a cllimit the
# MediaWiki API returns only 10 per page, which could hide the very
# meetup category this bot is checking for.
# Returns an Array of category-title Strings ([] on any HTTP error).
def getCategoriesFromImage(image)
  wiki = XkcdUrl.new
  wiki.queryParams = {}
  wiki.queryParams["action"] = "query"
  wiki.queryParams["titles"] = image.to_s.tr(' ', '_')
  wiki.queryParams["prop"] = "categories"
  wiki.queryParams["cllimit"] = "500" # API default is 10 -- far too small
  wiki.queryParams["format"] = "xml"  
  
  url = wiki.buildUrl
  r = Net::HTTP.get_response(url)
  if (r.code != "200")
    return []
  end

  cats = []
  xml = REXML::Document.new r.body
  xml.elements.each("api/query/pages/page/categories/cl") { |elt| cats.push elt.attributes['title'].to_s }
  return cats
end  

# Walk every "Meetup_on..." category, each category's member pages,
# and each page's images, printing an indented tree as it goes and
# reporting any image that is already tagged with its meetup's
# category. Sleeps 1s after every API call to be polite to the wiki.
def main
  categories = getMeetupCategories
  sleep 1
  categories.each do |category|
    tag = "Category:#{category}"
    puts tag
    pages = getPagesFromCategory(category)
    sleep 1
    pages.each do |page|
      puts " #{page}"
      images = getImagesFromMeetup(page)
      sleep 1
      images.each do |image|
        puts "  #{image}"
        image_categories = getCategoriesFromImage(image)
        sleep 1
        next unless image_categories.member?(tag)
        puts "Found an image that _is_ categorized: #{image} in #{category}"
      end
    end
  end
end

main