author | 2024-05-06 16:00:22 -0700
committer | 2024-05-06 16:00:22 -0700
commit | 2c08966397823ff9e909a88cfacff1aecea8ce2c
tree | 63210f64490371e769a281ef8d38192980379780
parent | ccd3b56396d2766905f0f32d75d02f259a965251
UpdatedCrawlerFile
-rw-r--r-- | CrawlingData.py | 93
1 file changed, 93 insertions, 0 deletions
diff --git a/CrawlingData.py b/CrawlingData.py
new file mode 100644
index 0000000..36002e8
--- /dev/null
+++ b/CrawlingData.py
@@ -0,0 +1,93 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy.exceptions import CloseSpider
+import os
+
+class ComputerScienceSpyder(CrawlSpider):
+    name = 'computerscience_data'
+    allowed_domains = ['c-sharpcorner.com', 'wikipedia.org', 'javatpoint.com']
+    custom_settings = {
+        'CONCURRENT_REQUESTS': 8,
+        'CONCURRENT_REQUESTS_PER_DOMAIN': 4,
+        'DOWNLOAD_DELAY': 1.0,
+        'AUTOTHROTTLE_ENABLED': True,
+        'AUTOTHROTTLE_START_DELAY': 2,
+        'AUTOTHROTTLE_TARGET_CONCURRENCY': 8,
+        'HTTPCACHE_ENABLED': True,
+        'HTTPCACHE_EXPIRATION_SECS': 1800,
+    }
+
+    def __init__(self, *args, **kwargs):
+        super(ComputerScienceSpyder, self).__init__(*args, **kwargs)
+        # Track URLs that have already been scraped to avoid duplicate items.
+        self.scraped_urls = set()
+
+    def parsing_data_func(self, result):
+        if result.url in self.scraped_urls:
+            return
+        self.scraped_urls.add(result.url)
+
+        # Collect all paragraph text on the page into one string.
+        content = ' '.join(result.xpath('//p/text()').getall()).strip()
+
+        # Resolve each <img src> against the page URL to get absolute links.
+        src_list = result.css('img::attr(src)').extract()
+        image_urls = [result.urljoin(url) for url in src_list]
+
+        yield {
+            'Domain': result.url.split('/')[2],
+            'URL': result.url,
+            'Title': result.css('title::text').get(),
+            'Content': content,
+            'Image URLs': '|'.join(image_urls),
+        }
+
+        # Stop crawling once the CSV feed reaches 0.5 GB. The feed file
+        # does not exist until the exporter has written the first items,
+        # so guard the size check.
+        file_path = 'computerscience_data.csv'
+        if os.path.exists(file_path):
+            file_size_bytes = os.path.getsize(file_path)
+            bytes_per_gigabyte = 1024 * 1024 * 1024
+            file_size_gigabytes = file_size_bytes / bytes_per_gigabyte
+            print(f"The file size is {file_size_gigabytes} GB")
+            if file_size_gigabytes >= 0.5:
+                raise CloseSpider("Done with Crawling")
+
+    rules = (
+        Rule(LinkExtractor(allow=()), callback='parsing_data_func', follow=True),
+    )
+
+
+process = CrawlerProcess(settings={
+    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
+    'ROBOTSTXT_OBEY': True,
+    'FEEDS': {
+        'computerscience_data.csv': {
+            'format': 'csv',
+            # Field names must match the keys yielded by parsing_data_func.
+            'fields': ['Domain', 'URL', 'Title', 'Content', 'Image URLs'],
+        },
+    },
+})
+
+# Start the crawling process over all seed pages.
+process.crawl(ComputerScienceSpyder, start_urls=[
+    'https://www.javatpoint.com/javascript-tutorial',
+    'https://www.javatpoint.com/c-programming-language-tutorial',
+    'https://www.javatpoint.com/cloud-computing',
+    'https://www.javatpoint.com/ajax-tutorial',
+    'https://www.javatpoint.com/json-tutorial',
+    'https://en.wikipedia.org/wiki/BERT_(language_model)',
+    'https://en.wikipedia.org/wiki/Computer_vision',
+    'https://www.c-sharpcorner.com/interview-questions-by-technology/android-programming',
+    'https://www.c-sharpcorner.com/interview-questions-by-technology/dot_net_2015',
+    'https://www.c-sharpcorner.com/interview-questions-by-technology/databases-and-dba',
+    'https://www.c-sharpcorner.com/interview-questions-by-technology/ios',
+    'https://en.wikipedia.org/wiki/C_Sharp_(programming_language)',
+    'https://en.wikipedia.org/wiki/C%2B%2B',
+    'https://en.wikipedia.org/wiki/U-Net',
+])
+process.start()
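
For reference, a minimal way to exercise the crawler and spot-check its CSV feed. This is a sketch, not part of the commit: it assumes Scrapy is installed, the file is saved as CrawlingData.py, and at least one crawl has completed so that computerscience_data.csv exists.

# The script builds its own CrawlerProcess and calls process.start() at
# module level, so it is run directly rather than via `scrapy crawl`:
#     python CrawlingData.py
#
# Afterwards, peek at the feed to confirm the CSV columns line up with
# the keys yielded by parsing_data_func.
import csv

with open('computerscience_data.csv', newline='', encoding='utf-8') as f:
    for i, row in enumerate(csv.DictReader(f)):
        print(row['Domain'], row['URL'], row['Title'])
        if i >= 4:  # the first five rows are enough for a spot check
            break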