author    NikhilMahendrakar <nikhilmahendrakar00@gmail.com> 2024-05-06 16:00:22 -0700
committer NikhilMahendrakar <nikhilmahendrakar00@gmail.com> 2024-05-06 16:00:22 -0700
commit    2c08966397823ff9e909a88cfacff1aecea8ce2c (patch)
tree      63210f64490371e769a281ef8d38192980379780 /CrawlingData.py
parent    ccd3b56396d2766905f0f32d75d02f259a965251 (diff)
UpdatedCrawlerFile
Diffstat (limited to 'CrawlingData.py')
-rw-r--r--  CrawlingData.py  98
1 file changed, 98 insertions(+), 0 deletions(-)
diff --git a/CrawlingData.py b/CrawlingData.py
new file mode 100644
index 0000000..36002e8
--- /dev/null
+++ b/CrawlingData.py
@@ -0,0 +1,98 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy.exceptions import CloseSpider
+import os
+
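+# CrawlSpider that crawls computer-science tutorial and reference pages on
+# three domains, extracts paragraph text and image URLs from each page, and
+# stops once the CSV output grows to roughly 0.5 GB.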
+class ComputerScienceSpyder(CrawlSpider):
+    name = 'computerscience_data'
+    allowed_domains = ['c-sharpcorner.com', 'wikipedia.org', 'javatpoint.com']
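+    # Politeness settings: bounded concurrency, a fixed download delay,
+    # AutoThrottle to adapt the request rate, and a 30-minute HTTP cache.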
+    custom_settings = {
+        'CONCURRENT_REQUESTS': 8,
+        'CONCURRENT_REQUESTS_PER_DOMAIN': 4,
+        'DOWNLOAD_DELAY': 1.0,
+        'AUTOTHROTTLE_ENABLED': True,
+        'AUTOTHROTTLE_START_DELAY': 2,
+        'AUTOTHROTTLE_TARGET_CONCURRENCY': 8,
+        'HTTPCACHE_ENABLED': True,
+        'HTTPCACHE_EXPIRATION_SECS': 1800,
+    }
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.scraped_urls = set()  # URLs already parsed, used to skip duplicates
+
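+    # Callback for each followed link: skip URLs seen before, extract the
+    # visible paragraph text and image sources, and yield one item per page.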
+    def parsing_data_func(self, result):
+        if result.url in self.scraped_urls:
+            return
+        self.scraped_urls.add(result.url)
+
+        # Join the text of every <p> element into a single string.
+        content = ' '.join(result.xpath('//p/text()').getall()).strip()
+
+        # Collect image sources and resolve relative paths against the page URL.
+        src_list = result.css('img::attr(src)').getall()
+        image_urls = [result.urljoin(src) for src in src_list]
+
+        yield {
+            'Domain': result.url.split('/')[2],  # netloc, e.g. 'en.wikipedia.org'
+            'URL': result.url,
+            'Title': result.css('title::text').get(),
+            'Content': content,
+            'Image URLs': '|'.join(image_urls),
+        }
+
+        # Stop once the CSV feed reaches ~0.5 GB. The feed file is created
+        # only after the first item is exported, so check that it exists first.
+        file_path = 'computerscience_data.csv'
+        if os.path.exists(file_path):
+            file_size_gigabytes = os.path.getsize(file_path) / (1024 ** 3)
+            print(f"The file size is {file_size_gigabytes:.3f} GB")
+            if file_size_gigabytes >= 0.5:
+                raise CloseSpider("Done with Crawling")
+
+
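+    # Follow every link extracted from a page (restricted to allowed_domains)
+    # and route each response through parsing_data_func.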
+    rules = (
+        Rule(LinkExtractor(allow=()), callback='parsing_data_func', follow=True),
+    )
+
+
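+# Run the spider in-process: obey robots.txt, send a browser-like User-Agent,
+# and export every yielded item to a CSV feed.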
+process = CrawlerProcess(settings={
+    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
+    'ROBOTSTXT_OBEY': True,
+    'FEEDS': {
+        'computerscience_data.csv': {
+            'format': 'csv',
+            # Field names must match the keys of the items yielded above.
+            'fields': ['Domain', 'URL', 'Title', 'Content', 'Image URLs'],
+        },
+    },
+})
+
+# Seed the crawl with tutorial and reference pages from each allowed domain.
+process.crawl(ComputerScienceSpyder, start_urls=[
+    'https://www.javatpoint.com/javascript-tutorial',
+    'https://www.javatpoint.com/c-programming-language-tutorial',
+    'https://www.javatpoint.com/cloud-computing',
+    'https://www.javatpoint.com/ajax-tutorial',
+    'https://www.javatpoint.com/json-tutorial',
+    'https://en.wikipedia.org/wiki/BERT_(language_model)',
+    'https://en.wikipedia.org/wiki/Computer_vision',
+    'https://www.c-sharpcorner.com/interview-questions-by-technology/android-programming',
+    'https://www.c-sharpcorner.com/interview-questions-by-technology/dot_net_2015',
+    'https://www.c-sharpcorner.com/interview-questions-by-technology/databases-and-dba',
+    'https://www.c-sharpcorner.com/interview-questions-by-technology/ios',
+    'https://en.wikipedia.org/wiki/C_Sharp_(programming_language)',
+    'https://en.wikipedia.org/wiki/C%2B%2B',
+    'https://en.wikipedia.org/wiki/U-Net',
+])
+process.start()  # Blocks until the crawl finishes or CloseSpider is raised.
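+# Usage: `python CrawlingData.py`; scraped items are written to
+# computerscience_data.csv in the working directory.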