#!/usr/bin/ruby
#
# Fetch the F1SA RSS feed and enrich each item with the full article text.
#
# (Apparently adapted from a script for the heise RSS feeds, which supported:
#
#   news      - heise newsticker
#   telepolis - Telepolis
#   security  - heise security news)
#
# Change history
#
# 26.06.2009 erb suppressed error messages due to unresponsive servers
#
require 'net/http'
require 'uri'
require 'rexml/document'
require 'hpricot'
require 'open-uri'
require 'timeout'

include REXML
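
# Overall flow: fetch the feed, then for each <item> fetch the linked
# article page, extract the <div id="body_outer"> content, inline it as
# the item's <description> and add a <guid> pointing at the article URL.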
# Try to retrieve a web page, following redirects (at most four with the
# default depth of 5).
def geturl(url, depth = 5)
  raise ArgumentError, 'Followed more than 4 redirections. Stopping this nightmare now.' if depth == 0
  response = Net::HTTP.get_response(URI.parse(url))
  case response
  when Net::HTTPSuccess then response.body
  when Net::HTTPRedirection then geturl(response['location'], depth - 1) # follow the redirection
  else
    # any other error shall not make any noise (maybe we should produce a fake RSS item)
    ""
  end
end
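
# Note: geturl is not used below (the feed and the articles are fetched
# via open-uri's open() instead). A minimal usage sketch, with a
# hypothetical URL:
#
#   html = geturl('http://www.example.com/')
#   puts html.length unless html.empty?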

# open-uri consults the lowercase http_proxy environment variable, so
# mirror HTTP_PROXY into it when only the uppercase form is set.
if ENV['http_proxy'].nil? && !ENV['HTTP_PROXY'].nil?
  ENV['http_proxy'] = ENV['HTTP_PROXY']
end

feedurl = 'http://www.f1sa.com/index2.php?option=com_rss&feed=RSS2.0&no_html=1'

# get the feed, retrying on timeout (at most four attempts in total)
feed_text = ""
retries = 4
begin
  Timeout::timeout(15) do
    f = open(feedurl)
    feed_text = f.read unless f.nil?
  end
rescue Timeout::Error
  retries -= 1
  exit 1 if retries < 1
  sleep 1
  retry
rescue
  # any other error shall not make any noise (maybe we should produce a fake RSS item)
end

# exit 2 if the feed looks empty or truncated (exit 1 above means timeout)
exit 2 if feed_text.length < 20
#print "Got this feed: ", feed_text, "\n"; STDOUT.flush

xml = Document.new(feed_text)

# loop over the feed items
xml.elements.each("//item") do |item|
  # extract the link to the article; skip items without one
  link = item.elements['link']
  next if link.nil?
  article_url = link.text
  # fetch the full article page
  begin
    article = open(article_url)
  rescue
    next
  end
  next if article.nil?
  article_text = ""
  begin
    article_xml = Hpricot(article)
  rescue
    next
  end
#puts "Got article from #{article_url}"
# F1SA special: extract the division:
# first <div id="body_outer">
# and combine them in that order
article_xml.search("//div[@id]").each do |divitem|
if divitem.attributes['id'] == "body_outer"
article_text = "<div>"
article_text << divitem.inner_html << "</div>"
break
end
end
article_text.gsub!(/<!-- START of joscomment -->.*\Z/m, "")
#puts "Got this text: #{article_text}"
# get rid of comments and other annoying artifacts
article_text.gsub!(/<!--[^>]*-->/, "")
article_text.gsub!(/\s+/m, " ")
next if article_text.length < 10
  # replace the item's description with the full article text
  item.delete_element("description")
  description = Element.new("description")
  description.add(CData.new(article_text)) # CDATA keeps the HTML markup intact
  item.add_element(description)
  # add a guid so aggregators can identify the item
  guid = Element.new("guid")
  guid.text = article_url
  item.add_element(guid)
end

# reproduce the enriched feed on stdout
xml.write($stdout, -1)
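
# Typical invocation (the script name is hypothetical):
#   ruby f1sa_fulltext_rss.rb > f1sa_full.rss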