aboutsummaryrefslogtreecommitdiff
path: root/index.py
diff options
context:
space:
mode:
Diffstat (limited to 'index.py')
-rw-r--r--index.py95
1 files changed, 95 insertions, 0 deletions
diff --git a/index.py b/index.py
new file mode 100644
index 0000000..de8fca3
--- /dev/null
+++ b/index.py
@@ -0,0 +1,95 @@
+import csv
+import sys
+import lucene
+
+from java.nio.file import Path, Paths
+from org.apache.lucene.analysis.standard import StandardAnalyzer
+from org.apache.lucene.document import Document, Field, TextField, StoredField
+from org.apache.lucene.index import IndexWriter, IndexWriterConfig, DirectoryReader
+from org.apache.lucene.store import NIOFSDirectory
+from org.apache.lucene.search import IndexSearcher
+from org.apache.lucene.queryparser.classic import QueryParser
+
+
+INDEX_PATH = "index/"
+CSV_PATH = "computerscience_data.csv"
+
+initialized = False
+
+
+class Index:
+ def __init__(self):
+ self.init()
+ self.analyzer = StandardAnalyzer()
+ self.dir = NIOFSDirectory(Paths.get(INDEX_PATH))
+
+ def init(self):
+ global initialized
+ if not initialized:
+ lucene.initVM()
+ initialized = True
+
+ def read_csv(self):
+ rows = []
+ with open(CSV_PATH) as csvfile:
+ reader = csv.reader(csvfile)
+ rows = [row for row in reader]
+ return rows
+
+ def has_data(self):
+ try:
+ reader = DirectoryReader.open(self.dir)
+ ret = reader.numDocs() > 0
+ reader.close()
+ return ret
+ except Exception as e:
+ return False
+
+ def build_index(self):
+ self.init()
+ if self.has_data():
+ return
+
+ writerConfig = IndexWriterConfig(StandardAnalyzer())
+ writer = IndexWriter(self.dir, writerConfig)
+
+ rows = self.read_csv()
+
+ for i in range(1, len(rows)):
+ doc = Document()
+ doc.add(TextField("domain", rows[i][0], Field.Store.YES))
+ doc.add(TextField("title", rows[i][1], Field.Store.YES))
+ doc.add(TextField("content", rows[i][2], Field.Store.YES))
+ doc.add(StoredField("images", rows[i][3]))
+ doc.add(StoredField("url", rows[i][4]))
+ writer.addDocument(doc)
+
+ writer.close()
+
+ def query(self, query):
+ # Attach current thread to JVM
+ lucene.getVMEnv().attachCurrentThread()
+
+ reader = DirectoryReader.open(self.dir)
+ print(reader.numDocs())
+ searcher = IndexSearcher(reader)
+ parser = QueryParser("content", self.analyzer)
+ q = parser.parse(query)
+ hits = searcher.search(q, 10)
+
+ results = []
+ for hit in hits.scoreDocs:
+ doc = searcher.doc(hit.doc)
+
+ results.append({
+ "score": hit.score,
+ "id": hit.doc,
+ "domain": doc.get("domain"),
+ "title": doc.get("title"),
+ "content": "",
+ "content": doc.get("content"),
+ "images": doc.get("images").split("|"),
+ "url": doc.get("url"),
+ })
+
+ return results