-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathexample.py
More file actions
105 lines (92 loc) · 4.1 KB
/
example.py
File metadata and controls
105 lines (92 loc) · 4.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from whoosh.fields import Schema, TEXT, ID, KEYWORD, DATETIME
from whoosh.analysis import StandardAnalyzer
from datetime import datetime, timedelta, timezone
from search_engine import SearchEngine
if __name__ == "__main__":
# Define some sample documents.
# Notice we include 'extra' fields that aren't specifically in the search schema.
# These will still be preserved because SearchEngine pickles the entire input dict.
docs = [
{
"id": "1",
"title": "First document",
"description": "This is the first document we've added in San Francisco!",
"tags": ["foo", "bar"],
"date": datetime.now(timezone.utc) - timedelta(days=0),
"extra": "kittens and cats",
},
{
"id": "2",
"title": "Second document",
"description": "The second one is even more interesting!",
"tags": ["alice"],
"date": datetime.now(timezone.utc) - timedelta(days=1),
"extra": "foals and horses",
},
{
"id": "3",
"title": "Third document",
"description": "The third one is less interesting!",
"tags": ["bob"],
"date": datetime.now(timezone.utc) - timedelta(days=2),
"extra": "bunny and rabbit",
},
]
# Define the search schema.
# - ID: Unindexed value that can be retrieved quickly.
# - TEXT: Indexed (searchable) text.
# - StandardAnalyzer: Tokenizes text using standard rules (e.g., splits on whitespace and punctuation).
# - KEYWORD: Space-separated keywords, indexed as a single unit or individually.
# - DATETIME: Indexed as a date for range queries.
schema = Schema(
id=ID(stored=True),
title=TEXT(stored=True),
description=TEXT(stored=True, analyzer=StandardAnalyzer(), spelling=True),
tags=KEYWORD(stored=True, lowercase=True, commas=True),
date=DATETIME(stored=True),
)
print("--- Schema Configuration ---")
for k, v in schema.items():
print(f"Field '{k}': {v.__class__.__name__}")
print(f"Total searchable fields: {schema.names()}")
print("-" * 28 + "\n")
# Initialize and index
engine = SearchEngine(schema)
engine.index_documents(docs)
print(f"Successfully indexed {engine.get_index_size()} documents.\n")
# demonstrate simple keyword queries
print("--- Keyword Searching ---")
# Whoosh is case-insensitive by default with most analyzers.
for q in ["first", "second", "THIRD", "bob", "alice"]:
print(f"\nQuery: '{q}'")
engine.search(q, limit=10, print_only=True)
# demonstrate phrase search and multifield search
print("\n--- Phrase and Multi-field Searching ---")
# 'san francisco' will match because it's in the description.
# 'kittens' will match because SearchEngine searches all fields (except 'raw').
# Note: 'kittens' is in the 'extra' field of doc 1.
# BUT WAIT: 'extra' isn't in the schema, so it won't be indexed for search!
# The 'pickle' trick allows us to SEE 'extra' in the results,
# but it doesn't make 'extra' searchable unless we add it to the schema.
for q in ["san francisco", "kittens"]:
print(f"\nQuery: '{q}'")
engine.search(q, limit=10, print_only=True)
# demonstrate complex boolean logic
print("\n--- Boolean Logic Searching ---")
q = "San Francisco OR (tags:alice AND description:interesting)"
print(f"Query: '{q}'")
engine.search(q, limit=10, print_only=True)
# demonstrate date range queries
print("\n--- Date Range Searching ---")
# Syntax: [YYYYMMDD TO YYYYMMDD]
today = datetime.now(timezone.utc).date()
yesterday = today - timedelta(days=1)
q = f"date:[{yesterday.strftime('%Y%m%d')} TO {today.strftime('%Y%m%d')}]"
print(f"Searching for documents from {yesterday} to {today}...")
print(f"Query: '{q}'")
engine.search(q, limit=10, print_only=True)
# demonstrate spelling suggestions
print("\n--- Spelling Suggestions ---")
q = "fransisco" # misspelled
print(f"Query: '{q}'")
engine.search(q, limit=5, print_only=True)