79668882

Date: 2025-06-17 10:18:39
Score: 3
Natty:
Report link

This is also my code, and we have the same problem.

# ai_firefox_scraper.py (Fixed)

import asyncio
import csv
import json
import random
from pathlib import Path

from playwright.async_api import async_playwright

SAVE_DIR = Path("scraped_data")
SAVE_DIR.mkdir(exist_ok=True)


class FirefoxSmartScraper:

    def __init__(self, max_pages=5, throttle=(4, 8)):
        self.max_pages = max_pages
        self.throttle = throttle  # min/max seconds to pause between actions

    async def search_and_scrape(self, topic: str):
        async with async_playwright() as p:
            browser = await p.firefox.launch(headless=False)
            context = await browser.new_context()
            page = await context.new_page()

            print(f"šŸ” Searching DuckDuckGo for: {topic}")
            await page.goto("https://duckduckgo.com", timeout=30000)
            await page.wait_for_selector("input[name='q']")

            # Type like a human
            for c in topic:
                await page.type("input[name='q']", c, delay=random.randint(100, 200))
            await page.keyboard.press("Enter")

            await page.wait_for_selector("a.result__a", timeout=20000)
            await asyncio.sleep(random.uniform(*self.throttle))

            # Extract real links only
            items = await page.query_selector_all("a.result__a")
            urls = []
            for item in items[:self.max_pages]:
                try:
                    title = await item.inner_text()
                    href = await item.get_attribute("href")
                    # Ensure it's a valid URL
                    if href and href.startswith("http"):
                        urls.append({"title": title.strip(), "url": href})
                except Exception as e:
                    print(f"[!] Failed to parse link: {e}")
                    continue

            if not urls:
                print("āŒ No links found.")
                await browser.close()
                return

            print(f"šŸ”— Visiting {len(urls)} pages...")
            scraped = []
            for idx, link in enumerate(urls):
                print(f"\nšŸ“„ [{idx + 1}] {link['title']}")
                try:
                    await page.goto(link["url"], timeout=30000)
                    await asyncio.sleep(random.uniform(*self.throttle))
                    content = await page.text_content("body")
                    scraped.append({
                        "title": link["title"],
                        "url": link["url"],
                        "content": (content or "")[:1500]  # Limit content; guard against None
                    })
                except Exception as e:
                    print(f"[!] Failed to scrape: {link['url']}\nReason: {e}")
                    continue

            await browser.close()
            self.save_data(topic, scraped)

    def save_data(self, topic: str, data: list):
        filename_json = SAVE_DIR / f"{topic.replace(' ', '_')}_data.json"
        filename_csv = SAVE_DIR / f"{topic.replace(' ', '_')}_data.csv"

        # Save as JSON
        with open(filename_json, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        # Save as CSV
        with open(filename_csv, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=["title", "url", "content"])
            writer.writeheader()
            for entry in data:
                writer.writerow(entry)

        print(f"\nāœ… Saved {len(data)} entries to:\n- {filename_json}\n- {filename_csv}")


def main():
    topic = input("šŸ”Ž Enter topic to crawl web for data: ").strip()
    if not topic:
        print("āŒ No topic entered.")
        return
    scraper = FirefoxSmartScraper()
    asyncio.run(scraper.search_and_scrape(topic))


if __name__ == "__main__":
    main()

This is my code for general web scraping. I don't know what's wrong; it doesn't fetch any data from the internet, or maybe the websites are really well protected.
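One way I've been trying to narrow it down: the script never seems to get past wait_for_selector("a.result__a"), so my guess is that either that selector no longer matches what the JavaScript version of duckduckgo.com renders, or the page is serving a bot check instead of results. The snippet below is only a diagnostic sketch, not a fix (the debug_duckduckgo name and the output file names are my own, not part of the scraper): it runs the same search, saves a screenshot and the raw HTML so you can see what Firefox actually received, and counts how many a.result__a links exist on the page.

# debug_duckduckgo.py — diagnostic sketch, not a fix.
# Assumption: the failure happens around wait_for_selector("a.result__a").
import asyncio

from playwright.async_api import async_playwright


async def debug_duckduckgo(topic: str):
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=False)
        page = await browser.new_page()
        await page.goto("https://duckduckgo.com", timeout=30000)
        await page.fill("input[name='q']", topic)
        await page.keyboard.press("Enter")
        # Wait for the network to settle instead of a specific selector,
        # so we still capture the page even if the selector is wrong.
        await page.wait_for_load_state("networkidle")
        await page.screenshot(path="ddg_debug.png", full_page=True)
        with open("ddg_debug.html", "w", encoding="utf-8") as f:
            f.write(await page.content())
        # Count how many old-style result links actually exist on the page.
        count = await page.locator("a.result__a").count()
        print(f"a.result__a matches: {count}")
        await browser.close()


asyncio.run(debug_duckduckgo("playwright firefox scraping"))

If the count is zero but the screenshot clearly shows results, the markup has just changed and only the selectors need updating. As far as I know, the plain-HTML endpoint at https://html.duckduckgo.com/html/ has historically kept result__a-style links, so it might also be worth testing the scraper against that instead of the JavaScript page.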

Reasons:
  • Long answer (-1):
  • No code block (0.5):
  • Me too answer (2.5): have the same problem
  • Low reputation (1):
Posted by: Oliver Gonzales