path: root/backend/internal/ibd/transport/scrapfly/scrapfly.go
// Package scrapfly provides a transport that proxies HTTP requests through
// the Scrapfly scrape API.
package scrapfly

import (
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strconv"

	"github.com/ansg191/ibd-trader-backend/internal/ibd/transport"
)

// ScrapflyTransport routes HTTP requests to their target indirectly through
// the Scrapfly scrape API.
type ScrapflyTransport struct {
	client  *http.Client  // HTTP client used to reach the Scrapfly API
	apiKey  string        // Scrapfly API key, sent as the `key` query parameter
	options ScrapeOptions // scrape options encoded into each request's query string
}

// New creates a ScrapflyTransport that uses the given HTTP client and API
// key, applying any ScrapeOption overrides on top of defaultScrapeOptions.
func New(client *http.Client, apiKey string, opts ...ScrapeOption) *ScrapflyTransport {
	options := defaultScrapeOptions
	for _, opt := range opts {
		opt(&options)
	}

	return &ScrapflyTransport{
		client:  client,
		apiKey:  apiKey,
		options: options,
	}
}
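
// A minimal usage sketch (illustrative, not part of the original file): the
// key below is a placeholder, and option defaults come from
// defaultScrapeOptions, defined elsewhere in this package.
//
//	t := New(http.DefaultClient, "YOUR_SCRAPFLY_KEY")
//	req, _ := http.NewRequest(http.MethodGet, "https://example.com/", nil)
//	resp, err := t.Do(req)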

// String returns the transport's name.
func (s *ScrapflyTransport) String() string {
	return "scrapfly"
}

// Do implements the transport: it wraps req in a Scrapfly scrape request,
// sends it, and converts the scraper's JSON envelope back into an
// *http.Response.
func (s *ScrapflyTransport) Do(req *http.Request) (*http.Response, error) {
	// Construct the scrape request URL from the configured base URL.
	scrapeURL, err := url.Parse(s.options.baseURL)
	if err != nil {
		return nil, fmt.Errorf("scrapfly: invalid base URL %q: %w", s.options.baseURL, err)
	}
	scrapeURL.RawQuery = s.constructRawQuery(req.URL, req.Header)

	// The Scrapfly API can't handle a `Content-Type` header on GET requests
	// (a weird quirk of the API), so reject such requests up front.
	if req.Method == http.MethodGet && req.Header.Get("Content-Type") != "" {
		return nil, transport.ErrUnsupportedRequest
	}

	// Construct the scrape request. The original headers are deliberately not
	// copied onto it: they were already encoded into the query string above.
	scrapeReq, err := http.NewRequestWithContext(req.Context(), req.Method, scrapeURL.String(), req.Body)
	if err != nil {
		return nil, err
	}

	// Send scrape request
	resp, err := s.client.Do(scrapeReq)
	if err != nil {
		return nil, err
	}
	defer func() {
		_ = resp.Body.Close()
	}()

	// Parse scrape response
	scraperResponse := new(ScraperResponse)
	err = json.NewDecoder(resp.Body).Decode(scraperResponse)
	if err != nil {
		return nil, err
	}

	// Convert scraper response to http.Response
	return scraperResponse.ToHTTPResponse()
}

// Properties reports this transport's characteristics; Scrapfly is treated
// as a reliable transport.
func (s *ScrapflyTransport) Properties() transport.Properties {
	return transport.PropertiesReliable
}

// constructRawQuery builds the query string for a Scrapfly scrape request:
// the API key, the target URL, the configured scrape options, and the
// original request headers encoded as indexed `headers[<name>][<index>]`
// parameters.
func (s *ScrapflyTransport) constructRawQuery(u *url.URL, headers http.Header) string {
	params := url.Values{}
	params.Set("key", s.apiKey)
	params.Set("url", u.String())
	if s.options.country != nil {
		params.Set("country", *s.options.country)
	}
	params.Set("asp", strconv.FormatBool(s.options.asp))
	params.Set("proxy_pool", s.options.proxyPool.String())
	params.Set("render_js", strconv.FormatBool(s.options.renderJS))
	params.Set("cache", strconv.FormatBool(s.options.cache))

	for k, v := range headers {
		for i, vv := range v {
			params.Add(
				fmt.Sprintf("headers[%s][%d]", k, i),
				vv,
			)
		}
	}

	return params.Encode()
}
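
// An illustrative sketch (not part of the original file) of the query string
// constructRawQuery produces. Concrete option values depend on
// defaultScrapeOptions, defined elsewhere in this package; <api-key> and
// <pool> are placeholders. url.Values.Encode percent-encodes the header
// brackets and emits keys in sorted order, e.g. for a request to
// https://example.com/ carrying a single "X-Foo: bar" header:
//
//	asp=true&cache=false&headers%5BX-Foo%5D%5B0%5D=bar&key=<api-key>&proxy_pool=<pool>&render_js=false&url=https%3A%2F%2Fexample.com%2F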