1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
|
import csv
import sys
import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, TextField, StoredField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, DirectoryReader
from org.apache.lucene.store import NIOFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.queryparser.classic import QueryParser
# Filesystem locations for the on-disk Lucene index and the CSV source data.
INDEX_PATH = "index/"
CSV_PATH = "computerscience_data.csv"
# Module-level guard so lucene.initVM() runs at most once per process
# (calling initVM twice aborts the JVM).
initialized = False
# Raise the csv field-size cap: article bodies can exceed the default 128 KiB limit.
csv.field_size_limit(sys.maxsize)
class Index:
    """Thin wrapper around a PyLucene on-disk index built from a CSV dump.

    Expects CSV rows of the form: domain, title, content, images, url
    (the first row is a header and is skipped when indexing).
    """

    def __init__(self):
        self.init()
        self.analyzer = StandardAnalyzer()
        self.dir = NIOFSDirectory(Paths.get(INDEX_PATH))

    def init(self):
        """Start the JVM exactly once per process (initVM must not run twice)."""
        global initialized
        if not initialized:
            lucene.initVM()
            initialized = True

    def read_csv(self):
        """Return every row of CSV_PATH (header row included) as a list of lists."""
        # newline="" is required by the csv module so quoted embedded
        # newlines inside fields are parsed correctly.
        with open(CSV_PATH, newline="") as csvfile:
            return list(csv.reader(csvfile))

    def has_data(self):
        """Return True if the index directory already contains documents.

        DirectoryReader.open raises when the directory holds no index yet,
        so any failure is treated as "no data".
        """
        try:
            reader = DirectoryReader.open(self.dir)
            try:
                return reader.numDocs() > 0
            finally:
                # Release the reader even if numDocs() raises.
                reader.close()
        except Exception:
            return False

    def build_index(self):
        """Index every CSV data row; no-op when the index is already populated."""
        self.init()
        if self.has_data():
            return
        print("Building Index...")
        # Reuse the instance analyzer so indexing and querying agree.
        writer = IndexWriter(self.dir, IndexWriterConfig(self.analyzer))
        try:
            rows = self.read_csv()
            # rows[0] is the CSV header, so start from the first data row.
            for i in range(1, len(rows)):
                if i % 100 == 0:
                    print(f"Processing {i}/{len(rows)}")
                domain, title, content, images, url = rows[i][:5]
                doc = Document()
                doc.add(TextField("domain", domain, Field.Store.YES))
                doc.add(TextField("title", title, Field.Store.YES))
                doc.add(TextField("content", content, Field.Store.YES))
                # images/url are retrieval-only payloads: store without tokenizing.
                doc.add(StoredField("images", images))
                doc.add(StoredField("url", url))
                writer.addDocument(doc)
            print("Finished Indexing...")
        finally:
            # Commit and release the index write lock even if indexing failed.
            writer.close()

    def query(self, query):
        """Search title and content for *query*; return up to 10 hits as dicts.

        Each result dict carries: score, id (Lucene doc id), domain, title,
        content, images (list split on "|"), and url.
        """
        # Attach current thread to JVM (required when called from a
        # non-main thread, e.g. a web-server worker).
        lucene.getVMEnv().attachCurrentThread()
        reader = DirectoryReader.open(self.dir)
        try:
            searcher = IndexSearcher(reader)
            # The default field only applies to unprefixed terms; both clauses
            # below carry explicit field prefixes, so "content" is a safe default.
            parser = QueryParser("content", self.analyzer)
            q = parser.parse(f"content:{query} OR title:{query}")
            hits = searcher.search(q, 10)
            results = []
            for hit in hits.scoreDocs:
                doc = searcher.doc(hit.doc)
                images = doc.get("images")
                results.append(
                    {
                        "score": hit.score,
                        "id": hit.doc,
                        "domain": doc.get("domain"),
                        "title": doc.get("title"),
                        "content": doc.get("content"),
                        # Guard against a missing stored field (get() returns null).
                        "images": images.split("|") if images else [],
                        "url": doc.get("url"),
                    }
                )
            return results
        finally:
            # Release the point-in-time reader and detach the thread
            # even if parsing or searching raised.
            reader.close()
            lucene.getVMEnv().detachCurrentThread()
|