import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

visited = set()

def crawl(start_url, limit=20):
    """Breadth-first crawl from start_url, collecting up to `limit` pages."""
    urls = [start_url]
    pages = []
    while urls and len(pages) < limit:
        url = urls.pop(0)
        if url in visited:
            continue
        visited.add(url)  # mark before fetching so a failing URL is never retried
        try:
            r = requests.get(url, timeout=5)
            r.raise_for_status()
        except requests.RequestException:
            continue  # skip pages that time out or return an error status
        soup = BeautifulSoup(r.text, "html.parser")
        pages.append({"url": url, "content": soup.get_text()})
        # Queue every absolute link found on the page.
        for link in soup.find_all("a", href=True):
            new_url = urljoin(url, link["href"])
            if new_url.startswith("http") and new_url not in visited:
                urls.append(new_url)
    return pages


import json
import re

def tokenize(text):
    return re.findall(r"\w+", text.lower())

def build_index(pages):
    """Build an inverted index mapping each term to the URLs it appears on."""
    index = {}
    for page in pages:
        # Deduplicate terms within a page; crawl() already guarantees each
        # page has a unique URL, so no per-term membership check is needed.
        for word in set(tokenize(page["content"])):
            index.setdefault(word, []).append(page["url"])
    with open("index.json", "w") as f:
        json.dump(index, f)
    print("Index built with", len(index), "terms")


from flask import Flask, request
from search import search

app = Flask(__name__)

@app.route("/")
def home():
    return """