path: root/CrawlingData.py
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import CloseSpider
import os

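# CrawlSpider that harvests paragraph text and image links from a handful of
# computer-science sites, writing one CSV row per page until the output file
# reaches about 0.5 GB.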
class ComputerScienceSpyder(CrawlSpider):
    name = 'computerscience_data'
    allowed_domains = ['c-sharpcorner.com', 'wikipedia.org', 'javatpoint.com']
    custom_settings = {
        'CONCURRENT_REQUESTS': 8,              # global request concurrency
        'CONCURRENT_REQUESTS_PER_DOMAIN': 4,   # be polite to each site
        'DOWNLOAD_DELAY': 1.0,                 # base delay between requests
        'AUTOTHROTTLE_ENABLED': True,          # adapt delay to server latency
        'AUTOTHROTTLE_START_DELAY': 2,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': 8,
        'HTTPCACHE_ENABLED': True,             # cache responses locally
        'HTTPCACHE_EXPIRATION_SECS': 1800,     # expire cached pages after 30 min
    }

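    # Scrapy's scheduler already deduplicates requests; the set below
    # additionally prevents yielding more than one item per URL.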
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.scraped_urls = set()

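    # Callback for every page matched by the rule below: extracts the page
    # title, paragraph text, and image URLs, then closes the spider once the
    # output feed reaches the size cap.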
    def parsing_data_func(self, result):
        # Skip pages that have already been parsed.
        if result.url in self.scraped_urls:
            return
        self.scraped_urls.add(result.url)

        # Join all paragraph text on the page into one string.
        content = ' '.join(result.xpath('//p/text()').getall()).strip()

        # Resolve every <img src> against the page URL to get absolute links.
        src_list = result.css('img::attr(src)').getall()
        image_urls = [result.urljoin(url) for url in src_list]

        yield {
            'Domain': result.url.split('/')[2],
            'URL': result.url,
            'Title': result.css('title::text').get(),
            'Content': content,
            'Image URLs': '|'.join(image_urls),
        }

        # Stop crawling once the output feed reaches roughly 0.5 GB. Guard
        # against the file not existing yet on the first few responses.
        file_path = 'computerscience_data.csv'
        if os.path.exists(file_path):
            file_size_bytes = os.path.getsize(file_path)
            bytes_per_gigabyte = 1024 * 1024 * 1024
            file_size_gigabytes = file_size_bytes / bytes_per_gigabyte
            print(f"The file size is {file_size_gigabytes:.3f} GB")

            if file_size_gigabytes >= 0.5:
                raise CloseSpider("Done with Crawling")

    # Follow every link within the allowed domains and parse each page
    # with parsing_data_func.
    rules = (
        Rule(LinkExtractor(allow=()), callback='parsing_data_func', follow=True),
    )


process = CrawlerProcess(settings={
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
    'ROBOTSTXT_OBEY': True,
    # Feed field names must match the keys of the items yielded by the spider.
    'FEEDS': {
        'computerscience_data.csv': {
            'format': 'csv',
            'fields': ['Domain', 'URL', 'Title', 'Content', 'Image URLs'],
        },
    },
})

# Start the crawl with seed URLs spanning all three allowed domains.
process.crawl(ComputerScienceSpyder, start_urls=[
    'https://www.javatpoint.com/javascript-tutorial',
    'https://www.javatpoint.com/c-programming-language-tutorial',
    'https://www.javatpoint.com/cloud-computing',
    'https://www.javatpoint.com/ajax-tutorial',
    'https://www.javatpoint.com/json-tutorial',
    'https://en.wikipedia.org/wiki/BERT_(language_model)',
    'https://en.wikipedia.org/wiki/Computer_vision',
    'https://www.c-sharpcorner.com/interview-questions-by-technology/dot_net_2015',
    'https://www.c-sharpcorner.com/interview-questions-by-technology/android-programming',
    'https://www.c-sharpcorner.com/interview-questions-by-technology/databases-and-dba',
    'https://www.c-sharpcorner.com/interview-questions-by-technology/ios',
    'https://en.wikipedia.org/wiki/C_Sharp_(programming_language)',
    'https://en.wikipedia.org/wiki/C%2B%2B',
    'https://en.wikipedia.org/wiki/U-Net',
])
process.start()
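
# process.start() blocks until the crawl finishes: either every followed link
# is exhausted or parsing_data_func raises CloseSpider at the size cap.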