aboutsummaryrefslogtreecommitdiff
path: root/bridges/CraigslistBridge.php
blob: 9a2c135871915a51354737aeeacda91afda779a2 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
<?php

/**
 * Bridge that turns a Craigslist search results page into feed items.
 *
 * The search is identified by a region (the subdomain in front of
 * craigslist.org) and by the part of the URL that follows /search/.
 */
class CraigslistBridge extends BridgeAbstract
{
    const NAME = 'Craigslist Bridge';
    const URI = 'https://craigslist.org/';
    const DESCRIPTION = 'Returns craigslist search results';

    const PARAMETERS = [ [
        'region' => [
            'name' => 'Region',
            'title' => 'The subdomain before craigslist.org in the URL',
            'exampleValue' => 'sfbay',
            'required' => true
        ],
        'search' => [
            'name' => 'Search Query',
            'title' => 'Everything in the URL after /search/',
            'exampleValue' => 'sya?query=laptop',
            'required' => true
        ],
        'limit' => [
            'name' => 'Number of Posts',
            'type' => 'number',
            'title' => 'The maximum number of posts is 120. Use 0 for unlimited posts.',
            'defaultValue' => '25'
        ]
    ]];

    const TEST_DETECT_PARAMETERS = [
        'https://sfbay.craigslist.org/search/sya?query=laptop' => [
            'region' => 'sfbay', 'search' => 'sya?query=laptop'
        ],
        'https://newyork.craigslist.org/search/sss?query=32gb+flash+drive&bundleDuplicates=1&max_price=20' => [
            'region' => 'newyork', 'search' => 'sss?query=32gb+flash+drive&bundleDuplicates=1&max_price=20'
        ],
    ];

    // The dots of the host name are escaped so that they only match a literal
    // '.'; an unescaped dot would accept any character in that position.
    const URL_REGEX = '/^https:\/\/(?<region>\w+)\.craigslist\.org\/search\/(?<search>.+)/';

    /**
     * Extract the bridge parameters from a Craigslist search URL.
     *
     * @param string $url URL to inspect.
     * @return array|null Array with the keys 'region' and 'search' when the URL
     *                    matches URL_REGEX, null otherwise.
     */
    public function detectParameters($url)
    {
        if (preg_match(self::URL_REGEX, $url, $matches) !== 1) {
            return null;
        }

        return [
            'region' => $matches['region'],
            'search' => $matches['search'],
        ];
    }

    /**
     * Build the URL of the search results page for the current inputs.
     *
     * Falls back to the parent implementation when no region input is set.
     *
     * NOTE(review): the 'region' input is concatenated into the host name
     * without validation; consider restricting it to the characters accepted
     * by URL_REGEX.
     *
     * @return string
     */
    public function getURI()
    {
        $region = $this->getInput('region');
        if (is_null($region)) {
            return parent::getURI();
        }

        $domain = 'https://' . $region . '.craigslist.org/search/';
        return urljoin($domain, $this->getInput('search'));
    }

    /**
     * Fetch the search results page and append one item per result row to
     * $this->items.
     *
     * Rows without a heading link are skipped. Collection stops at the
     * "h4.nearby" banner, so rows that follow it are not returned.
     */
    public function collectData()
    {
        $uri = $this->getURI();
        $html = getSimpleHTMLDOM($uri);

        // Check if no results page is shown (nearby results)
        $displayCount = trim($html->find('.displaycountShow', 0)->plaintext ?? '');
        if ($displayCount === '0') {
            return;
        }

        // Search for "more from nearby areas" banner in order to skip those results
        $results = $html->find('.result-row, h4.nearby');

        // Limit the number of posts
        $limit = $this->getInput('limit');
        if ($limit > 0) {
            $results = array_slice($results, 0, $limit);
        }

        foreach ($results as $post) {
            // Skip "nearby results" banner and results
            // This only appears when searchNearby is not specified
            if ($post->tag == 'h4') {
                break;
            }

            $heading = $post->find('.result-heading a', 0);
            if (is_null($heading)) {
                // Without the heading link there is no URI, title or uid to report.
                continue;
            }

            $item = [];
            $item['uri'] = $heading->href;
            $item['title'] = $heading->plaintext;
            $item['uid'] = $heading->id;

            // Only set the timestamp when the row actually carries a date element.
            $date = $post->find('.result-date', 0);
            if (!is_null($date)) {
                $item['timestamp'] = $date->datetime;
            }

            $price = $post->find('.result-price', 0)->plaintext ?? '';
            // Find the location (local and nearby results if searchNearby=1)
            $nearby = $post->find('.result-hood, span.nearby', 0)->plaintext ?? '';
            $item['content'] = sprintf('%s %s', $price, $nearby);

            $images = $post->find('.result-image[data-ids]', 0);
            if (!is_null($images)) {
                $item['content'] .= '<br>';
                foreach (explode(',', $images->getAttribute('data-ids')) as $image) {
                    // Remove leading 3: from each image id
                    $id = substr($image, 2);
                    if ($id === false || $id === '') {
                        // An empty or too short entry would yield a bogus image URL.
                        continue;
                    }
                    $image_uri = 'https://images.craigslist.org/' . $id . '_300x300.jpg';
                    $item['content'] .= '<img src="' . $image_uri . '">';
                    $item['enclosures'][] = $image_uri;
                }
            }
            $this->items[] = $item;
        }
    }
}