diff options
author | 2024-06-07 14:47:49 -0700 | |
---|---|---|
committer | 2024-06-07 14:47:49 -0700 | |
commit | 6a03b939b4dc288f16775d415689ccfe5fe9e5aa (patch) | |
tree | fdc821826233072d05e664b5c66f1180a1b3a5cb | |
parent | befde6e530394248e6fa2f68bddea4a6b516119e (diff) | |
download | CS172-Project-6a03b939b4dc288f16775d415689ccfe5fe9e5aa.tar.gz CS172-Project-6a03b939b4dc288f16775d415689ccfe5fe9e5aa.tar.zst CS172-Project-6a03b939b4dc288f16775d415689ccfe5fe9e5aa.zip |
Fix bugs due to weird websites
-rw-r--r-- | index.py | 29 | ||||
-rw-r--r-- | web/src/Doc.tsx | 4 |
2 files changed, 20 insertions, 13 deletions
@@ -2,7 +2,7 @@ import csv import sys import lucene -from java.nio.file import Path, Paths +from java.nio.file import Paths from org.apache.lucene.analysis.standard import StandardAnalyzer from org.apache.lucene.document import Document, Field, TextField, StoredField from org.apache.lucene.index import IndexWriter, IndexWriterConfig, DirectoryReader @@ -16,6 +16,8 @@ CSV_PATH = "computerscience_data.csv" initialized = False +csv.field_size_limit(sys.maxsize) + class Index: def __init__(self): @@ -50,12 +52,16 @@ class Index: if self.has_data(): return + print("Building Index...") writerConfig = IndexWriterConfig(StandardAnalyzer()) writer = IndexWriter(self.dir, writerConfig) rows = self.read_csv() for i in range(1, len(rows)): + if i % 100 == 0: + print(f"Processing {i}/{len(rows)}") + doc = Document() doc.add(TextField("domain", rows[i][0], Field.Store.YES)) doc.add(TextField("title", rows[i][1], Field.Store.YES)) @@ -81,15 +87,16 @@ class Index: for hit in hits.scoreDocs: doc = searcher.doc(hit.doc) - results.append({ - "score": hit.score, - "id": hit.doc, - "domain": doc.get("domain"), - "title": doc.get("title"), - "content": "", - "content": doc.get("content"), - "images": doc.get("images").split("|"), - "url": doc.get("url"), - }) + results.append( + { + "score": hit.score, + "id": hit.doc, + "domain": doc.get("domain"), + "title": doc.get("title"), + "content": doc.get("content"), + "images": doc.get("images").split("|"), + "url": doc.get("url"), + } + ) return results diff --git a/web/src/Doc.tsx b/web/src/Doc.tsx index 28db714..44d120e 100644 --- a/web/src/Doc.tsx +++ b/web/src/Doc.tsx @@ -22,9 +22,9 @@ const resultDocSchema = z.object({ id: z.number().int(), score: z.number(), domain: z.string(), - url: z.string().url(), + url: z.string(), title: z.string(), - images: z.string().url().array(), + images: z.string().array(), content: z.string(), }) |