aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Anshul Gupta <ansg191@anshulg.com> 2024-06-07 14:47:49 -0700
committerGravatar Anshul Gupta <ansg191@anshulg.com> 2024-06-07 14:47:49 -0700
commit6a03b939b4dc288f16775d415689ccfe5fe9e5aa (patch)
treefdc821826233072d05e664b5c66f1180a1b3a5cb
parentbefde6e530394248e6fa2f68bddea4a6b516119e (diff)
downloadCS172-Project-6a03b939b4dc288f16775d415689ccfe5fe9e5aa.tar.gz
CS172-Project-6a03b939b4dc288f16775d415689ccfe5fe9e5aa.tar.zst
CS172-Project-6a03b939b4dc288f16775d415689ccfe5fe9e5aa.zip
Fix bugs due to weird websites
-rw-r--r-- index.py 29
-rw-r--r-- web/src/Doc.tsx 4
2 files changed, 20 insertions, 13 deletions
diff --git a/index.py b/index.py
index de8fca3..96b7e9d 100644
--- a/index.py
+++ b/index.py
@@ -2,7 +2,7 @@ import csv
import sys
import lucene
-from java.nio.file import Path, Paths
+from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, TextField, StoredField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, DirectoryReader
@@ -16,6 +16,8 @@ CSV_PATH = "computerscience_data.csv"
initialized = False
+csv.field_size_limit(sys.maxsize)
+
class Index:
def __init__(self):
@@ -50,12 +52,16 @@ class Index:
if self.has_data():
return
+ print("Building Index...")
writerConfig = IndexWriterConfig(StandardAnalyzer())
writer = IndexWriter(self.dir, writerConfig)
rows = self.read_csv()
for i in range(1, len(rows)):
+ if i % 100 == 0:
+ print(f"Processing {i}/{len(rows)}")
+
doc = Document()
doc.add(TextField("domain", rows[i][0], Field.Store.YES))
doc.add(TextField("title", rows[i][1], Field.Store.YES))
@@ -81,15 +87,16 @@ class Index:
for hit in hits.scoreDocs:
doc = searcher.doc(hit.doc)
- results.append({
- "score": hit.score,
- "id": hit.doc,
- "domain": doc.get("domain"),
- "title": doc.get("title"),
- "content": "",
- "content": doc.get("content"),
- "images": doc.get("images").split("|"),
- "url": doc.get("url"),
- })
+ results.append(
+ {
+ "score": hit.score,
+ "id": hit.doc,
+ "domain": doc.get("domain"),
+ "title": doc.get("title"),
+ "content": doc.get("content"),
+ "images": doc.get("images").split("|"),
+ "url": doc.get("url"),
+ }
+ )
return results
diff --git a/web/src/Doc.tsx b/web/src/Doc.tsx
index 28db714..44d120e 100644
--- a/web/src/Doc.tsx
+++ b/web/src/Doc.tsx
@@ -22,9 +22,9 @@ const resultDocSchema = z.object({
id: z.number().int(),
score: z.number(),
domain: z.string(),
- url: z.string().url(),
+ url: z.string(),
title: z.string(),
- images: z.string().url().array(),
+ images: z.string().array(),
content: z.string(),
})