1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
|
import csv
import sys
import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, TextField, StoredField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, DirectoryReader
from org.apache.lucene.store import NIOFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.queryparser.classic import QueryParser
# Filesystem locations for the on-disk Lucene index and the CSV source data.
INDEX_PATH = "index/"
CSV_PATH = "computerscience_data.csv"
# Module-level guard so lucene.initVM() runs at most once per process
# (calling initVM twice aborts the JVM).
initialized = False
# Raise the csv field-size cap: article bodies can exceed the default 128 KiB limit.
csv.field_size_limit(sys.maxsize)
class Index:
    """Thin wrapper around a PyLucene on-disk index built from a CSV dump.

    Expects CSV rows of the form: domain, title, content, images, url
    (the first row is a header and is skipped when indexing).
    """

    def __init__(self):
        self.init()
        self.analyzer = StandardAnalyzer()
        self.dir = NIOFSDirectory(Paths.get(INDEX_PATH))

    def init(self):
        """Start the JVM exactly once per process (initVM must not run twice)."""
        global initialized
        if not initialized:
            lucene.initVM()
            initialized = True

    def read_csv(self):
        """Return every row of CSV_PATH (header row included) as a list of lists."""
        # newline="" is required by the csv module so quoted embedded
        # newlines inside fields are parsed correctly.
        with open(CSV_PATH, newline="") as csvfile:
            return list(csv.reader(csvfile))

    def has_data(self):
        """Return True if the index directory already contains documents.

        DirectoryReader.open raises when the directory holds no index yet,
        so any failure is treated as "no data".
        """
        try:
            reader = DirectoryReader.open(self.dir)
            try:
                return reader.numDocs() > 0
            finally:
                # Release the reader even if numDocs() raises.
                reader.close()
        except Exception:
            return False

    def build_index(self):
        """Index every CSV data row; no-op when the index is already populated."""
        self.init()
        if self.has_data():
            return
        print("Building Index...")
        # Reuse the instance analyzer so indexing and querying agree.
        writer = IndexWriter(self.dir, IndexWriterConfig(self.analyzer))
        try:
            rows = self.read_csv()
            # rows[0] is the CSV header, so start from the first data row.
            for i in range(1, len(rows)):
                if i % 100 == 0:
                    print(f"Processing {i}/{len(rows)}")
                domain, title, content, images, url = rows[i][:5]
                doc = Document()
                doc.add(TextField("domain", domain, Field.Store.YES))
                doc.add(TextField("title", title, Field.Store.YES))
                doc.add(TextField("content", content, Field.Store.YES))
                # images/url are retrieval-only payloads: store without tokenizing.
                doc.add(StoredField("images", images))
                doc.add(StoredField("url", url))
                writer.addDocument(doc)
            print("Finished Indexing...")
        finally:
            # Commit and release the index write lock even if indexing failed.
            writer.close()

    def query(self, query):
        """Search title and content for *query*; return up to 10 hits as dicts.

        Each result dict carries: score, id (Lucene doc id), domain, title,
        content, images (list split on "|"), and url.
        """
        # Attach current thread to JVM (required when called from a
        # non-main thread, e.g. a web-server worker).
        lucene.getVMEnv().attachCurrentThread()
        reader = DirectoryReader.open(self.dir)
        try:
            searcher = IndexSearcher(reader)
            # The default field only applies to unprefixed terms; both clauses
            # below carry explicit field prefixes, so "content" is a safe default.
            parser = QueryParser("content", self.analyzer)
            q = parser.parse(f"content:{query} OR title:{query}")
            hits = searcher.search(q, 10)
            results = []
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                images = doc.get("images")
                results.append(
                    {
                        "score": hit.score,
                        "id": hit.doc,
                        "domain": doc.get("domain"),
                        "title": doc.get("title"),
                        "content": doc.get("content"),
                        # Guard against a missing stored field (get() returns null).
                        "images": images.split("|") if images else [],
                        "url": doc.get("url"),
                    }
                )
            return results
        finally:
            # Release the point-in-time reader and detach the thread
            # even if parsing or searching raised.
            reader.close()
            lucene.getVMEnv().detachCurrentThread()
|