#!/usr/bin/ruby # # get, parse and enrich slashdot rss feed # # Change history # # 26.06.2009 erb stopped error from appearing if servers do not respond # require 'net/http' require 'uri' require 'open-uri' require 'timeout' require 'rexml/document' include REXML require 'hpricot' if ARGV.length > 0 print "usage: #{File::basename($0)}\n" exit end if ENV['http_proxy'].nil? && !ENV['HTTP_PROXY'].nil? ENV['http_proxy'] = ENV['HTTP_PROXY'] end feedurl = 'http://rss.slashdot.org/Slashdot/slashdot' feed_text = "" retries = 4 begin Timeout::timeout(15) do feed = open(feedurl) exit 1 if feed.nil? feed_text = feed.read end rescue Timeout::Error retries -= 1 exit 2 if retries < 1 sleep 1 retry end exit 3 if feed_text.length < 20 xml = Document.new(feed_text) xml.elements.each("//item") do |item| # extract link to article article_url = item.attributes.get_attribute('rdf:about').value article_url.sub!(%r{\?from=rss$}, "/") # get full text for article begin article = open(article_url) rescue next end next if article.nil? article_text="" begin article_xml = Hpricot(article) rescue next end # /. special: extract the two divisions: # first