-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtest_learning_engine.py
More file actions
169 lines (134 loc) · 6.02 KB
/
test_learning_engine.py
File metadata and controls
169 lines (134 loc) · 6.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
"""
Test the Learning Engine β self-improving extraction.
Tests:
1. First crawl β should learn pattern
2. Second crawl β should use learned pattern
3. Learning stats endpoint
4. Feedback endpoint
"""
import asyncio
import httpx
import os
from crawlkit.core.crawler import CrawlKit
from crawlkit.intelligence import PatternStorage, LearningEngine
async def test_learning_flow():
    """Test the complete learning flow: learn → apply → stats.

    Reads SUPABASE_URL / SUPABASE_SERVICE_KEY from the environment and
    bails out early (with a message) when the service key is missing.
    Performs two live crawls against vnexpress.net, so this requires
    network access.
    """
    print("\n" + "=" * 60)
    print("🧠 Testing CrawlKit Learning Engine")
    print("=" * 60)

    # Initialize learning engine with Supabase.
    # NOTE(review): the URL default is a placeholder; only the key is mandatory.
    SUPABASE_URL = os.getenv("SUPABASE_URL", "https://your-project.supabase.co")
    SUPABASE_KEY = os.getenv("SUPABASE_SERVICE_KEY")
    if not SUPABASE_KEY:
        print("\n❌ SUPABASE_SERVICE_KEY not set. Set it in environment.")
        return

    # Initialize storage and learning engine
    print("\n1️⃣ Initializing learning engine...")
    storage = PatternStorage(supabase_url=SUPABASE_URL, supabase_key=SUPABASE_KEY)
    learning_engine = LearningEngine(storage=storage)

    # Initialize crawler with learning engine
    crawler = CrawlKit(learning_engine=learning_engine)

    # Test URL — Vietnamese news site
    test_url = "https://vnexpress.net/tham-nhung-chay-noi-cua-ong-cuu-thu-truong-bo-giao-thong-4855072.html"

    print(f"\n2️⃣ First crawl → learning from: {test_url}")
    result1 = await crawler.scrape(test_url, auto_extract=True)
    if result1.error:
        print(f"   ❌ Error: {result1.error}")
        return

    print("   ✅ Success!")
    print(f"   Parser used: {result1.parser_used}")
    print(f"   Title: {result1.title}")
    print(f"   Content length: {len(result1.markdown)} chars")
    print(f"   Content type: {result1.content_type}")

    # Check if pattern was learned for the crawled domain
    print("\n3️⃣ Checking if pattern was learned...")
    domain = "vnexpress.net"
    patterns = storage.get_patterns(domain)
    if patterns:
        print(f"   ✅ Learned {len(patterns)} pattern(s) for {domain}")
        for p in patterns:
            print(f"      • URL pattern: {p.url_pattern}")
            print(f"      • Quality score: {p.quality_score:.2f}")
            print(f"      • Content selectors: {p.content_selectors[:2]}")
    else:
        # Patterns below the engine's quality threshold are not persisted.
        print("   ⚠️ No patterns learned yet (quality may be too low)")

    # Second crawl — should use learned pattern
    test_url2 = "https://vnexpress.net/di-xe-may-nuoc-ngoai-ve-viet-nam-phai-lam-gi-4854925.html"
    print(f"\n4️⃣ Second crawl → should use learned pattern: {test_url2}")
    result2 = await crawler.scrape(test_url2, auto_extract=True)
    if result2.error:
        print(f"   ❌ Error: {result2.error}")
    else:
        print("   ✅ Success!")
        print(f"   Parser used: {result2.parser_used}")
        # The parser name is prefixed with "learned:" when a stored pattern was applied.
        print(f"   Used learned pattern: {'learned:' in result2.parser_used}")
        print(f"   Title: {result2.title}")
        print(f"   Content length: {len(result2.markdown)} chars")

    # Get learning stats
    print("\n5️⃣ Learning engine statistics:")
    stats = storage.get_stats()
    print(f"   Storage: {stats.get('storage', 'unknown')}")
    print(f"   Cache size: {stats.get('cache_size', 0)} / {stats.get('cache_maxsize', 0)}")
    print(f"   Total patterns: {stats.get('total_patterns', 0)}")
    print(f"   Unique domains: {stats.get('unique_domains', 0)}")
    print(f"   Avg quality: {stats.get('avg_quality', 0):.3f}")

    if stats.get('top_domains'):
        print("\n   Top domains:")
        for domain in stats['top_domains'][:5]:
            print(f"      • {domain.get('domain', 'N/A')}: {domain.get('total_crawls', 0)} crawls, quality {domain.get('avg_quality_score', 0):.2f}")

    print("\n" + "=" * 60)
    print("✅ Learning engine test complete!")
    print("=" * 60)
async def test_api_endpoints():
    """Test learning API endpoints against a running CrawlKit server.

    Target server and credentials come from CRAWLKIT_API_URL and
    CRAWLKIT_MASTER_KEY (with local-dev defaults). Exercises:
    GET /v1/admin/learning/stats and POST /v1/feedback.
    """
    print("\n" + "=" * 60)
    print("🌐 Testing Learning API Endpoints")
    print("=" * 60)

    base_url = os.getenv("CRAWLKIT_API_URL", "http://localhost:8080")
    master_key = os.getenv("CRAWLKIT_MASTER_KEY", "ck_master_dev")
    headers = {"Authorization": f"Bearer {master_key}"}

    async with httpx.AsyncClient(timeout=30.0) as client:
        # Test learning stats endpoint
        print("\n1️⃣ Testing GET /v1/admin/learning/stats")
        try:
            resp = await client.get(f"{base_url}/v1/admin/learning/stats", headers=headers)
            print(f"   Status: {resp.status_code}")
            if resp.status_code == 200:
                data = resp.json()
                print("   ✅ Success:")
                print(f"   {data}")
            else:
                print(f"   ❌ Error: {resp.text}")
        except Exception as e:
            # Best-effort smoke test: report and keep going to the next endpoint.
            print(f"   ❌ Exception: {e}")

        # Test feedback endpoint
        print("\n2️⃣ Testing POST /v1/feedback")
        try:
            feedback = {
                "url": "https://vnexpress.net/test-article.html",
                "feedback": "good",
                "details": "Extraction was perfect!",
            }
            resp = await client.post(f"{base_url}/v1/feedback", json=feedback, headers=headers)
            print(f"   Status: {resp.status_code}")
            if resp.status_code == 200:
                data = resp.json()
                print(f"   ✅ Success: {data.get('message', 'OK')}")
            else:
                print(f"   ❌ Error: {resp.text}")
        except Exception as e:
            print(f"   ❌ Exception: {e}")

    print("\n" + "=" * 60)
    print("✅ API endpoint tests complete!")
    print("=" * 60)
async def main():
    """Run all tests."""
    # Test 1: Direct learning engine
    await test_learning_flow()

    # Test 2: API endpoints (only if server is running)
    try:
        await test_api_endpoints()
    except Exception as e:
        # The API suite needs a live local server; skip gracefully when absent.
        print(f"\n⚠️ API endpoint tests skipped (server not running): {e}")
if __name__ == "__main__":
asyncio.run(main())