from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import CloseSpider
import os

# Maximum output file size in GB before the spider is closed
MAX_FILE_SIZE_GB = float(os.environ.get("SCRAPY_MAX_FILE_SIZE_GB", 0.5))
# Maximum number of concurrent requests
MAX_CONCURRENT_REQUESTS = int(os.environ.get("SCRAPY_MAX_CONCURRENT_REQUESTS", 8))
# Maximum concurrent requests per domain
MAX_REQUESTS_PER_DOMAIN = int(os.environ.get("SCRAPY_MAX_REQUESTS_PER_DOMAIN", 4))
# Output CSV file
OUTPUT_FILE = os.environ.get("SCRAPY_OUTPUT_FILE", "computerscience_data.csv")


class ComputerScienceSpider(CrawlSpider):
    """Main Scrapy spider that extracts information from computer science pages."""

    name = 'computerscience_data'
    allowed_domains = ['c-sharpcorner.com', 'wikipedia.org', 'javatpoint.com']

    custom_settings = {
        'CONCURRENT_REQUESTS': MAX_CONCURRENT_REQUESTS,
        'CONCURRENT_REQUESTS_PER_DOMAIN': MAX_REQUESTS_PER_DOMAIN,
        'DOWNLOAD_DELAY': 1.0,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 2,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': MAX_CONCURRENT_REQUESTS,
        'HTTPCACHE_ENABLED': True,
        'HTTPCACHE_EXPIRATION_SECS': 1800,
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # URLs already scraped, kept to avoid yielding duplicate items
        self.scraped_urls = set()

    def parsing_data_func(self, response):
        # Skip pages we've scraped already
        if response.url in self.scraped_urls:
            return
        self.scraped_urls.add(response.url)

        # Extract the page's paragraph text
        content = ' '.join(response.xpath('//p/text()').getall()).strip()

        # Extract image source URLs, resolved against the page URL
        image_urls = [response.urljoin(src)
                      for src in response.css('img::attr(src)').getall()]

        yield {
            'Domain': response.url.split('/')[2],
            'URL': response.url,
            'Title': response.css('title::text').get(),
            'Content': content,
            'Image URLs': '|'.join(image_urls),
        }

        # Close the spider once enough data has been scraped. Guard against the
        # output file not existing yet; note that buffered feed writes can make
        # the measured size lag slightly behind the data actually scraped.
        if os.path.exists(OUTPUT_FILE):
            bytes_per_gigabyte = 1024 * 1024 * 1024
            file_size_gigabytes = os.path.getsize(OUTPUT_FILE) / bytes_per_gigabyte
            self.logger.info(f"The file size is {file_size_gigabytes} GB")
            if file_size_gigabytes >= MAX_FILE_SIZE_GB:
                raise CloseSpider("Done with Crawling")

    rules = (
        # Follow every link we encounter within the allowed domains
        Rule(LinkExtractor(allow=()), callback='parsing_data_func', follow=True),
    )
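# Note: response.url.split('/')[2] above relies on the URL being absolute
# (scheme://host/...). urllib.parse offers an equivalent that tolerates other
# URL shapes, e.g.:
#
#   from urllib.parse import urlparse
#   urlparse('https://en.wikipedia.org/wiki/U-Net').netloc  # 'en.wikipedia.org'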
process = CrawlerProcess(settings={
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
    'ROBOTSTXT_OBEY': True,
    'FEEDS': {
        OUTPUT_FILE: {
            'format': 'csv',
            # Field names must match the keys of the items yielded by the spider
            'fields': ['Domain', 'URL', 'Title', 'Content', 'Image URLs'],
        },
    },
})

# Start the crawling process with seed URLs for each domain
process.crawl(ComputerScienceSpider, start_urls=[
    'https://www.javatpoint.com/javascript-tutorial',
    'https://www.javatpoint.com/c-programming-language-tutorial',
    'https://www.javatpoint.com/cloud-computing',
    'https://www.javatpoint.com/ajax-tutorial',
    'https://www.javatpoint.com/json-tutorial',
    'https://en.wikipedia.org/wiki/BERT_(language_model)',
    'https://en.wikipedia.org/wiki/Computer_vision',
    'https://www.c-sharpcorner.com/interview-questions-by-technology/android-programming',
    'https://www.c-sharpcorner.com/interview-questions-by-technology/dot_net_2015',
    'https://www.c-sharpcorner.com/interview-questions-by-technology/databases-and-dba',
    'https://www.c-sharpcorner.com/interview-questions-by-technology/ios',
    'https://en.wikipedia.org/wiki/C_Sharp_(programming_language)',
    'https://en.wikipedia.org/wiki/C%2B%2B',
    'https://en.wikipedia.org/wiki/U-Net',
])
process.start()
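# Example invocation (a sketch, assuming a POSIX shell and that this script is
# saved as computerscience_spider.py; the filename and the values below are
# illustrative). The variable names match those read at the top of the file:
#
#   SCRAPY_MAX_FILE_SIZE_GB=1.0 \
#   SCRAPY_MAX_CONCURRENT_REQUESTS=16 \
#   SCRAPY_OUTPUT_FILE=cs_data.csv \
#   python computerscience_spider.py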