author     2024-05-06 16:19:09 -0700
committer  2024-05-06 16:20:48 -0700
commit     d2e06f817f20d9534798f2fb35f65182514913c5 (patch)
tree       e9f0a4e52cb62e390576234c39cbc7a841b6271b /scraper.py
parent     ca2bbbaa4c39f7f490a934921436062e1e743889 (diff)
Add comments and configuration options
Diffstat (limited to 'scraper.py')
-rw-r--r--  scraper.py  36
1 file changed, 27 insertions(+), 9 deletions(-)
diff --git a/scraper.py b/scraper.py
--- a/scraper.py
+++ b/scraper.py
@@ -5,39 +5,56 @@ from scrapy.linkextractors import LinkExtractor
 from scrapy.exceptions import CloseSpider
 import os
 
+# Maximum file size in GB before scraper is closed
+MAX_FILE_SIZE_GB = float(os.environ.get("SCRAPY_MAX_FILE_SIZE_GB", 0.5))
+# Maximum number of concurrent requests
+MAX_CONCURRENT_REQUESTS = int(os.environ.get("SCRAPY_MAX_CONCURRENT_REQUESTS", 8))
+# Maximum concurrent requests per domain
+MAX_REQUESTS_PER_DOMAIN = int(os.environ.get("SCRAPY_MAX_REQUESTS_PER_DOMAIN", 4))
+# Output CSV File
+OUTPUT_FILE = os.environ.get("SCRAPY_OUTPUT_FILE", "computerscience_data.csv")
+
 
 class ComputerScienceSpyder(CrawlSpider):
+    """
+    Main Scrapy Spider to extract information about computer science pages.
+    """
+
     name = 'computerscience_data'
     allowed_domains = ['c-sharpcorner.com', 'wikipedia.org', 'javatpoint.com']
 
     custom_settings = {
-        'CONCURRENT_REQUESTS': 8,
-        'CONCURRENT_REQUESTS_PER_DOMAIN': 4,
+        'CONCURRENT_REQUESTS': MAX_CONCURRENT_REQUESTS,
+        'CONCURRENT_REQUESTS_PER_DOMAIN': MAX_REQUESTS_PER_DOMAIN,
         'DOWNLOAD_DELAY': 1.0,
         'AUTOTHROTTLE_ENABLED': True,
         'AUTOTHROTTLE_START_DELAY': 2,
-        'AUTOTHROTTLE_TARGET_CONCURRENCY': 8,
+        'AUTOTHROTTLE_TARGET_CONCURRENCY': MAX_CONCURRENT_REQUESTS,
         'HTTPCACHE_ENABLED': True,
         'HTTPCACHE_EXPIRATION_SECS': 1800,
     }
 
     def __init__(self, *args, **kwargs):
         super(ComputerScienceSpyder, self).__init__(*args, **kwargs)
+
+        # Set to prevent duplicate pages
         self.scraped_urls = set()
 
     def parsing_data_func(self, result):
+        # Check if we've scraped this URL already
         if result.url in self.scraped_urls:
             return
+
         self.scraped_urls.add(result.url)
 
+        # Extract page text
         content = ' '.join(result.xpath('//p/text()').getall()).strip()
 
+        # Extract image source URLs
         src_list = result.css('img::attr(src)').extract()
-
         image_urls = []
         for url in src_list:
             full_url = result.urljoin(url)
-
             image_urls.append(full_url)
 
         yield {
@@ -48,16 +65,17 @@ class ComputerScienceSpyder(CrawlSpider):
             'Image URLs': '|'.join(image_urls),
         }
 
-        file_path = 'computerscience_data.csv'
-        file_size_bytes = os.path.getsize(file_path)
+        file_size_bytes = os.path.getsize(OUTPUT_FILE)
         bytes_per_gigabyte = 1024 * 1024 * 1024
         file_size_gigabytes = file_size_bytes / bytes_per_gigabyte
 
         print(f"The file size is {file_size_gigabytes} GB")
 
-        if file_size_gigabytes >= 0.5:
+        # Close if enough data is scraped
+        if file_size_gigabytes >= MAX_FILE_SIZE_GB:
             raise CloseSpider("Done with Crawling")
 
     rules = (
+        # Follow all links we encounter
         Rule(LinkExtractor(allow=()), callback='parsing_data_func', follow=True),
     )
@@ -66,7 +84,7 @@ process = CrawlerProcess(settings={
     'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
     'ROBOTSTXT_OBEY': True,
     'FEEDS': {
-        'computerscience_data.csv': {
+        OUTPUT_FILE: {
             'format': 'csv',
             'fields': ['Domain', 'Title', 'ParsedContent', 'ImageResourceLocator', 'ResourceLocator'],
         },
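
Because every new constant falls back to a default via os.environ.get, the crawl can presumably be tuned per run without editing scraper.py. A minimal launch sketch, assuming scraper.py is invoked directly as a script and using the variable names introduced in this commit (the override values are illustrative only):

import os
import subprocess

# Hypothetical wrapper: export the configuration knobs added in this commit,
# then start the spider in a child process with that environment.
env = dict(
    os.environ,
    SCRAPY_MAX_FILE_SIZE_GB="1.0",        # close the spider once the CSV reaches ~1 GB
    SCRAPY_MAX_CONCURRENT_REQUESTS="16",  # global concurrency cap
    SCRAPY_MAX_REQUESTS_PER_DOMAIN="8",   # per-domain concurrency cap
    SCRAPY_OUTPUT_FILE="cs_pages.csv",    # path the FEEDS exporter writes to
)
subprocess.run(["python", "scraper.py"], env=env, check=True)

Variables left unset keep the in-file defaults (0.5 GB, 8 concurrent requests, 4 per domain, computerscience_data.csv).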