-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtest_learning_engine.py
More file actions
169 lines (134 loc) · 6.02 KB
/
test_learning_engine.py
File metadata and controls
169 lines (134 loc) · 6.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
"""
Test the Learning Engine β self-improving extraction.
Tests:
1. First crawl β should learn pattern
2. Second crawl β should use learned pattern
3. Learning stats endpoint
4. Feedback endpoint
"""
import asyncio
import httpx
import os
from crawlkit.core.crawler import CrawlKit
from crawlkit.intelligence import PatternStorage, LearningEngine
async def test_learning_flow():
    """Test the complete learning flow: learn → apply → stats.

    Reads SUPABASE_URL / SUPABASE_SERVICE_KEY from the environment and
    bails out early (with a message) when the service key is missing.
    Performs two live crawls against vnexpress.net, so this requires
    network access.
    """
    print("\n" + "=" * 60)
    print("🧠 Testing CrawlKit Learning Engine")
    print("=" * 60)

    # Initialize learning engine with Supabase.
    # NOTE(review): the URL default is a placeholder; only the key is mandatory.
    SUPABASE_URL = os.getenv("SUPABASE_URL", "https://your-project.supabase.co")
    SUPABASE_KEY = os.getenv("SUPABASE_SERVICE_KEY")
    if not SUPABASE_KEY:
        print("\n❌ SUPABASE_SERVICE_KEY not set. Set it in environment.")
        return

    # Initialize storage and learning engine
    print("\n1️⃣ Initializing learning engine...")
    storage = PatternStorage(supabase_url=SUPABASE_URL, supabase_key=SUPABASE_KEY)
    learning_engine = LearningEngine(storage=storage)

    # Initialize crawler with learning engine
    crawler = CrawlKit(learning_engine=learning_engine)

    # Test URL — Vietnamese news site
    test_url = "https://vnexpress.net/tham-nhung-chay-noi-cua-ong-cuu-thu-truong-bo-giao-thong-4855072.html"

    print(f"\n2️⃣ First crawl → learning from: {test_url}")
    result1 = await crawler.scrape(test_url, auto_extract=True)
    if result1.error:
        print(f"   ❌ Error: {result1.error}")
        return

    print("   ✅ Success!")
    print(f"   Parser used: {result1.parser_used}")
    print(f"   Title: {result1.title}")
    print(f"   Content length: {len(result1.markdown)} chars")
    print(f"   Content type: {result1.content_type}")

    # Check if pattern was learned for the crawled domain
    print("\n3️⃣ Checking if pattern was learned...")
    domain = "vnexpress.net"
    patterns = storage.get_patterns(domain)
    if patterns:
        print(f"   ✅ Learned {len(patterns)} pattern(s) for {domain}")
        for p in patterns:
            print(f"      • URL pattern: {p.url_pattern}")
            print(f"      • Quality score: {p.quality_score:.2f}")
            print(f"      • Content selectors: {p.content_selectors[:2]}")
    else:
        # Patterns below the engine's quality threshold are not persisted.
        print("   ⚠️ No patterns learned yet (quality may be too low)")

    # Second crawl — should use learned pattern
    test_url2 = "https://vnexpress.net/di-xe-may-nuoc-ngoai-ve-viet-nam-phai-lam-gi-4854925.html"
    print(f"\n4️⃣ Second crawl → should use learned pattern: {test_url2}")
    result2 = await crawler.scrape(test_url2, auto_extract=True)
    if result2.error:
        print(f"   ❌ Error: {result2.error}")
    else:
        print("   ✅ Success!")
        print(f"   Parser used: {result2.parser_used}")
        # The parser name is prefixed with "learned:" when a stored pattern was applied.
        print(f"   Used learned pattern: {'learned:' in result2.parser_used}")
        print(f"   Title: {result2.title}")
        print(f"   Content length: {len(result2.markdown)} chars")

    # Get learning stats
    print("\n5️⃣ Learning engine statistics:")
    stats = storage.get_stats()
    print(f"   Storage: {stats.get('storage', 'unknown')}")
    print(f"   Cache size: {stats.get('cache_size', 0)} / {stats.get('cache_maxsize', 0)}")
    print(f"   Total patterns: {stats.get('total_patterns', 0)}")
    print(f"   Unique domains: {stats.get('unique_domains', 0)}")
    print(f"   Avg quality: {stats.get('avg_quality', 0):.3f}")

    if stats.get('top_domains'):
        print("\n   Top domains:")
        for domain in stats['top_domains'][:5]:
            print(f"      • {domain.get('domain', 'N/A')}: {domain.get('total_crawls', 0)} crawls, quality {domain.get('avg_quality_score', 0):.2f}")

    print("\n" + "=" * 60)
    print("✅ Learning engine test complete!")
    print("=" * 60)
async def test_api_endpoints():
    """Test learning API endpoints against a running CrawlKit server.

    Target server and credentials come from CRAWLKIT_API_URL and
    CRAWLKIT_MASTER_KEY (with local-dev defaults). Exercises:
    GET /v1/admin/learning/stats and POST /v1/feedback.
    """
    print("\n" + "=" * 60)
    print("🌐 Testing Learning API Endpoints")
    print("=" * 60)

    base_url = os.getenv("CRAWLKIT_API_URL", "http://localhost:8080")
    master_key = os.getenv("CRAWLKIT_MASTER_KEY", "ck_master_dev")
    headers = {"Authorization": f"Bearer {master_key}"}

    async with httpx.AsyncClient(timeout=30.0) as client:
        # Test learning stats endpoint
        print("\n1️⃣ Testing GET /v1/admin/learning/stats")
        try:
            resp = await client.get(f"{base_url}/v1/admin/learning/stats", headers=headers)
            print(f"   Status: {resp.status_code}")
            if resp.status_code == 200:
                data = resp.json()
                print("   ✅ Success:")
                print(f"   {data}")
            else:
                print(f"   ❌ Error: {resp.text}")
        except Exception as e:
            # Best-effort smoke test: report and keep going to the next endpoint.
            print(f"   ❌ Exception: {e}")

        # Test feedback endpoint
        print("\n2️⃣ Testing POST /v1/feedback")
        try:
            feedback = {
                "url": "https://vnexpress.net/test-article.html",
                "feedback": "good",
                "details": "Extraction was perfect!",
            }
            resp = await client.post(f"{base_url}/v1/feedback", json=feedback, headers=headers)
            print(f"   Status: {resp.status_code}")
            if resp.status_code == 200:
                data = resp.json()
                print(f"   ✅ Success: {data.get('message', 'OK')}")
            else:
                print(f"   ❌ Error: {resp.text}")
        except Exception as e:
            print(f"   ❌ Exception: {e}")

    print("\n" + "=" * 60)
    print("✅ API endpoint tests complete!")
    print("=" * 60)
async def main():
    """Run all tests."""
    # Test 1: Direct learning engine
    await test_learning_flow()

    # Test 2: API endpoints (only if server is running)
    try:
        await test_api_endpoints()
    except Exception as e:
        # The API suite needs a live local server; skip gracefully when absent.
        print(f"\n⚠️ API endpoint tests skipped (server not running): {e}")
if __name__ == "__main__":
asyncio.run(main())