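"""Scrapy crawler that collects page titles, paragraph text, and image URLs
from a fixed set of computer-science sites and writes them to a CSV feed,
stopping once the output file reaches a configurable size limit."""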
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import CloseSpider
import os
from urllib.parse import urlparse

# Maximum file size in GB before scraper is closed
MAX_FILE_SIZE_GB = float(os.environ.get("SCRAPY_MAX_FILE_SIZE_GB", 0.5))
# Maximum number of concurrent requests
MAX_CONCURRENT_REQUESTS = int(os.environ.get("SCRAPY_MAX_CONCURRENT_REQUESTS", 8))
# Maximum concurrent requests per domain
MAX_REQUESTS_PER_DOMAIN = int(os.environ.get("SCRAPY_MAX_REQUESTS_PER_DOMAIN", 4))
# Output CSV File
OUTPUT_FILE = os.environ.get("SCRAPY_OUTPUT_FILE", "computerscience_data.csv")
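# Example invocation (hypothetical values) overriding the defaults above:
#   SCRAPY_MAX_FILE_SIZE_GB=1.0 SCRAPY_OUTPUT_FILE=cs_data.csv python scraper.py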


class ComputerScienceSpyder(CrawlSpider):
    """
    Main Scrapy Spider to extract information about computer science pages.
    """

    name = 'computerscience_data'
    allowed_domains = ['c-sharpcorner.com', 'wikipedia.org', 'javatpoint.com']
    custom_settings = {
        'CONCURRENT_REQUESTS': MAX_CONCURRENT_REQUESTS,
        'CONCURRENT_REQUESTS_PER_DOMAIN': MAX_REQUESTS_PER_DOMAIN,
        'DOWNLOAD_DELAY': 1.0,
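        # AutoThrottle adapts the delay at runtime; DOWNLOAD_DELAY acts as its floor.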
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 2,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': MAX_CONCURRENT_REQUESTS,
        'HTTPCACHE_ENABLED': True,
        'HTTPCACHE_EXPIRATION_SECS': 1800,
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Set to prevent duplicate pages
        self.scraped_urls = set()
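        # Note: Scrapy's scheduler already drops duplicate requests via its
        # default dupefilter; this set is an extra guard at the parse level.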

    def parsing_data_func(self, result):
        """Extract the title, paragraph text, and image URLs from a response,
        yield one item, and close the spider once the output file is large enough."""
        # Skip URLs we've already parsed
        if result.url in self.scraped_urls:
            return

        self.scraped_urls.add(result.url)

        # Extract page text from all <p> elements
        content = ' '.join(result.xpath('//p/text()').getall()).strip()

        # Extract image sources, resolved against the page URL
        image_urls = [result.urljoin(src) for src in result.css('img::attr(src)').getall()]

        yield {
            'Domain': urlparse(result.url).netloc,
            'URL': result.url,
            'Title': result.css('title::text').get(),
            'Content': content,
            'Image URLs': '|'.join(image_urls),
        }

        # Stop crawling once the output feed reaches the size limit. The feed
        # file may not exist yet for the first items, so guard the size check.
        file_size_bytes = os.path.getsize(OUTPUT_FILE) if os.path.exists(OUTPUT_FILE) else 0
        bytes_per_gigabyte = 1024 * 1024 * 1024
        file_size_gigabytes = file_size_bytes / bytes_per_gigabyte
        self.logger.info("Output file size: %.3f GB", file_size_gigabytes)

        if file_size_gigabytes >= MAX_FILE_SIZE_GB:
            raise CloseSpider("Done with Crawling")

    rules = (
        # Follow all links we encounter
        Rule(LinkExtractor(allow=()), callback='parsing_data_func', follow=True),
    )
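    # To narrow the crawl, regex patterns could be passed to the extractor,
    # e.g. LinkExtractor(allow=(r'tutorial', r'/wiki/')) -- a sketch, not used here.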


if __name__ == '__main__':
    process = CrawlerProcess(settings={
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
        'ROBOTSTXT_OBEY': True,
        'FEEDS': {
            OUTPUT_FILE: {
                'format': 'csv',
                # Field names must match the keys of the items yielded above
                'fields': ['Domain', 'URL', 'Title', 'Content', 'Image URLs'],
            },
        },
    })

    # Start the crawl with seed URLs from each allowed domain
    process.crawl(ComputerScienceSpyder, start_urls=[
        'https://www.javatpoint.com/javascript-tutorial',
        'https://www.javatpoint.com/c-programming-language-tutorial',
        'https://www.javatpoint.com/cloud-computing',
        'https://www.javatpoint.com/ajax-tutorial',
        'https://www.javatpoint.com/json-tutorial',
        'https://en.wikipedia.org/wiki/BERT_(language_model)',
        'https://en.wikipedia.org/wiki/Computer_vision',
        'https://www.c-sharpcorner.com/interview-questions-by-technology/android-programming',
        'https://www.c-sharpcorner.com/interview-questions-by-technology/dot_net_2015',
        'https://www.c-sharpcorner.com/interview-questions-by-technology/databases-and-dba',
        'https://www.c-sharpcorner.com/interview-questions-by-technology/ios',
        'https://en.wikipedia.org/wiki/C_Sharp_(programming_language)',
        'https://en.wikipedia.org/wiki/C%2B%2B',
        'https://en.wikipedia.org/wiki/U-Net',
    ])
    process.start()