path: root/scraper.py
author    Anshul Gupta <ansg191@anshulg.com>  2024-05-06 16:19:09 -0700
committer Anshul Gupta <ansg191@anshulg.com>  2024-05-06 16:20:48 -0700
commit    d2e06f817f20d9534798f2fb35f65182514913c5 (patch)
tree      e9f0a4e52cb62e390576234c39cbc7a841b6271b /scraper.py
parent    ca2bbbaa4c39f7f490a934921436062e1e743889 (diff)
Add comments and configuration options
Diffstat (limited to 'scraper.py')
-rw-r--r--  scraper.py | 36
1 file changed, 27 insertions(+), 9 deletions(-)
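
The new options introduced below are read from environment variables when scraper.py is imported. As a minimal sketch of how a crawl might be configured, assuming the script is run directly (the variable names come from the diff; the values are illustrative only):

    import os

    # Illustrative values; any variable left unset falls back to the default in scraper.py.
    os.environ["SCRAPY_MAX_FILE_SIZE_GB"] = "1.0"
    os.environ["SCRAPY_MAX_CONCURRENT_REQUESTS"] = "16"
    os.environ["SCRAPY_MAX_REQUESTS_PER_DOMAIN"] = "8"
    os.environ["SCRAPY_OUTPUT_FILE"] = "cs_pages.csv"

    # ...then start the crawl as usual, e.g. `python scraper.py`.
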
diff --git a/scraper.py b/scraper.py
index 3e58d8e..c21c94e 100644
--- a/scraper.py
+++ b/scraper.py
@@ -5,39 +5,56 @@ from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import CloseSpider
import os
+# Maximum file size in GB before scraper is closed
+MAX_FILE_SIZE_GB = float(os.environ.get("SCRAPY_MAX_FILE_SIZE_GB", 0.5))
+# Maximum number of concurrent requests
+MAX_CONCURRENT_REQUESTS = int(os.environ.get("SCRAPY_MAX_CONCURRENT_REQUESTS", 8))
+# Maximum concurrent requests per domain
+MAX_REQUESTS_PER_DOMAIN = int(os.environ.get("SCRAPY_MAX_REQUESTS_PER_DOMAIN", 4))
+# Output CSV File
+OUTPUT_FILE = os.environ.get("SCRAPY_OUTPUT_FILE", "computerscience_data.csv")
+
class ComputerScienceSpyder(CrawlSpider):
+ """
+ Main Scrapy Spider to extract information about computer science pages.
+ """
+
name = 'computerscience_data'
allowed_domains = ['c-sharpcorner.com', 'wikipedia.org', 'javatpoint.com']
custom_settings = {
- 'CONCURRENT_REQUESTS': 8,
- 'CONCURRENT_REQUESTS_PER_DOMAIN': 4,
+ 'CONCURRENT_REQUESTS': MAX_CONCURRENT_REQUESTS,
+ 'CONCURRENT_REQUESTS_PER_DOMAIN': MAX_REQUESTS_PER_DOMAIN,
'DOWNLOAD_DELAY': 1.0,
'AUTOTHROTTLE_ENABLED': True,
'AUTOTHROTTLE_START_DELAY': 2,
- 'AUTOTHROTTLE_TARGET_CONCURRENCY': 8,
+ 'AUTOTHROTTLE_TARGET_CONCURRENCY': MAX_CONCURRENT_REQUESTS,
'HTTPCACHE_ENABLED': True,
'HTTPCACHE_EXPIRATION_SECS': 1800,
}
def __init__(self, *args, **kwargs):
super(ComputerScienceSpyder, self).__init__(*args, **kwargs)
+
+ # Set to prevent duplicate pages
self.scraped_urls = set()
def parsing_data_func(self, result):
+ # Check if we've scraped this URL already
if result.url in self.scraped_urls:
return
+
self.scraped_urls.add(result.url)
+ # Extract page text
content = ' '.join(result.xpath('//p/text()').getall()).strip()
+ # Extract image source URLs
src_list = result.css('img::attr(src)').extract()
-
image_urls = []
for url in src_list:
full_url = result.urljoin(url)
-
image_urls.append(full_url)
yield {
@@ -48,16 +65,17 @@ class ComputerScienceSpyder(CrawlSpider):
'Image URLs': '|'.join(image_urls),
}
- file_path = 'computerscience_data.csv'
- file_size_bytes = os.path.getsize(file_path)
+ file_size_bytes = os.path.getsize(OUTPUT_FILE)
bytes_per_gigabyte = 1024 * 1024 * 1024
file_size_gigabytes = file_size_bytes / bytes_per_gigabyte
print(f"The file size is {file_size_gigabytes} GB")
- if file_size_gigabytes >= 0.5:
+ # Close if enough data is scraped
+ if file_size_gigabytes >= MAX_FILE_SIZE_GB:
raise CloseSpider("Done with Crawling")
rules = (
+ # Follow all links we encounter
Rule(LinkExtractor(allow=()), callback='parsing_data_func', follow=True),
)
@@ -66,7 +84,7 @@ process = CrawlerProcess(settings={
'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
'ROBOTSTXT_OBEY': True,
'FEEDS': {
- 'computerscience_data.csv': {
+ OUTPUT_FILE: {
'format': 'csv',
'fields': ['Domain', 'Title', 'ParsedContent', 'ImageResourceLocator', 'ResourceLocator'],
},
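
For reference, the size check in the second hunk converts the output file's size from bytes to gigabytes with a 1024**3 divisor before comparing it against MAX_FILE_SIZE_GB. A standalone sketch of the same calculation, not the project's code, using the default output file name from the diff:

    import os

    def file_size_gb(path):
        # Size in GB, matching bytes_per_gigabyte = 1024 * 1024 * 1024 in the diff.
        return os.path.getsize(path) / (1024 ** 3)

    if __name__ == "__main__":
        output = "computerscience_data.csv"
        if os.path.exists(output) and file_size_gb(output) >= 0.5:
            print("Threshold reached; at this point the spider raises CloseSpider.")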