79637420

Date: 2025-05-25 06:32:44
Score: 1
Natty:
Report link

🔍 Root Causes

  1. Server IP reputation: IP addresses from cloud/VPS providers (AWS, Azure, etc.) are often flagged as suspicious because they belong to known data-center ranges.

  2. Headless mode: Even with undetected_chromedriver, running in headless mode can increase detection likelihood.

  3. Identifiable browser fingerprints: Subtle mismatches in JS APIs, fonts, canvas fingerprinting, or automation indicators.

  4. Repeated queries: Frequent or patterned queries look automated.


✅ Recommendations to Avoid CAPTCHA

1. Use Proxies (Residential > Datacenter)



Use residential proxies instead of datacenter IPs to mimic real users.

Rotate proxies to avoid rate-limiting.

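self.chrome_options.add_argument('--proxy-server=http://user:pass@proxyhost:port')

Note: Chrome's --proxy-server switch generally ignores credentials embedded in the URL, so an authenticated proxy usually needs IP allow-listing on the proxy side or a local forwarding proxy instead.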

2. Run in Non-Headless Mode (especially on the server)

Websites' detection of headless browsers has improved significantly, so a visible (GUI) browser session is harder to flag:

parser = YandexParser(USE_GUI=True)

3. Simulate Human Interaction Better

4. Randomize Behavior More

5. Use CAPTCHA Solvers

6. Delay Startup and Query Frequency

7. Use Yandex's Mobile Version or Alternate Domains

import logging
import pathlib
import random
import time
import traceback
from datetime import datetime
from urllib.parse import quote_plus

import undetected_chromedriver as uc
from fake_useragent import UserAgent
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

# Root logging configuration: INFO and above go both to the
# "yandex_parser.log" file and to the console stream.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.FileHandler("yandex_parser.log"), logging.StreamHandler()],
    level=logging.INFO,
)

# Named logger shared by every function and method in this module.
logger = logging.getLogger('ParserLogger')


class YandexParser():
    """Collect organic result links from ya.ru search pages.

    The parser drives a Chrome instance created by ``undetected_chromedriver``
    with a randomly chosen user agent, performs a few scroll/mouse actions
    after each navigation, and saves a screenshot of every visited page
    (captcha or not) into the ``screens`` directory.
    """

    # Search URL template; ``{query}`` receives the URL-encoded search text.
    _SEARCH_URL = (
        "https://ya.ru/search/?text={query}"
        "&lr=213&search_source=yaru_desktop_common&search_domain=yaru"
    )
    # Directory that receives the screenshots taken by ``check_captcha``.
    _SCREENSHOT_DIR = "screens"
    # Number of result pages visited per query.
    _MAX_PAGES = 4

    def __init__(self, USE_GUI=True):
        """Start the browser and open the ya.ru landing page.

        Args:
            USE_GUI: When false, Chrome is started with ``--headless=new``
                plus the sandbox/shared-memory switches.

        Raises:
            Exception: Whatever the driver raises during start-up; if the
                driver was already created it is quit before re-raising so
                that no browser process is left behind.
        """
        self.ua = UserAgent()
        self.chrome_options = uc.ChromeOptions()
        if not USE_GUI:
            self.chrome_options.add_argument('--headless=new')  # headless=new is less detectable
            self.chrome_options.add_argument('--no-sandbox')
            self.chrome_options.add_argument('--disable-dev-shm-usage')

        # Stealth settings
        self.chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        self.chrome_options.add_argument("--disable-infobars")
        self.chrome_options.add_argument("--disable-popup-blocking")
        self.chrome_options.add_argument("--start-maximized")
        self.chrome_options.add_argument(f"user-agent={self.ua.random}")

        # Optional: add a proxy here if you have one.
        # NOTE(review): Chrome's --proxy-server switch is not known to honour
        # credentials embedded in the URL; confirm how the proxy authenticates.
        # self.chrome_options.add_argument('--proxy-server=http://proxy_host:proxy_port')

        self.driver = uc.Chrome(options=self.chrome_options)

        try:
            # Hide the navigator.webdriver flag on every new document.
            self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                'source': '''
                Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
            '''
            })

            self.driver.get("https://ya.ru/")
            self._simulate_human()
            time.sleep(random.uniform(2, 4))
        except Exception:
            # Do not leak the already-started browser when warm-up fails.
            self.driver.quit()
            raise

    @staticmethod
    def _build_search_url(film_name):
        """Return the ya.ru search URL for *film_name* with the text URL-encoded."""
        return YandexParser._SEARCH_URL.format(query=quote_plus(film_name))

    @staticmethod
    def _timestamp():
        """Return the current local time as a string safe for file names."""
        # No ':' or spaces, so the name is valid on Windows file systems too.
        return datetime.now().strftime("%Y-%m-%d_%H-%M-%S.%f")

    def _simulate_human(self):
        """Scroll a third of the page and nudge the mouse; failures are only logged."""
        try:
            # Scroll and mouse movement
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight / 3);")
            ActionChains(self.driver).move_by_offset(random.randint(5, 200), random.randint(5, 200)).perform()
            time.sleep(random.uniform(0.5, 1.5))
        except Exception as e:
            logger.warning(f"Human simulation failed: {e}")

    def close(self):
        """Quit the browser session."""
        self.driver.quit()

    def check_captcha(self):
        """Screenshot the current page and try to click through a captcha.

        A captcha page is recognised by ``showcaptcha`` in the current URL.
        In that case the checkbox button is clicked (best effort) and a
        second screenshot is taken afterwards.

        Returns:
            True if the current page was a captcha page, otherwise False.
        """
        # Make sure the target directory exists even when the class is used
        # outside of the script entry point.
        shots = pathlib.Path(self._SCREENSHOT_DIR)
        shots.mkdir(parents=True, exist_ok=True)
        cur_time = self._timestamp()

        if "showcaptcha" not in self.driver.current_url:
            self.driver.save_screenshot(str(shots / f'img_{cur_time}.png'))
            return False

        logger.warning("Captcha found")
        self.driver.save_screenshot(str(shots / f'img_captcha_{cur_time}.png'))

        try:
            button = self.driver.find_element(By.XPATH, "//input[@class='CheckboxCaptcha-Button']")
            button.click()
            logger.info("Captcha checkbox clicked")
            time.sleep(random.uniform(1.5, 3))
            self.driver.save_screenshot(str(shots / f'img_captcha_afterclick_{cur_time}.png'))
        except Exception as e:
            logger.warning(f"Captcha click failed: {e}")
        return True

    def parse(self, film_name: str):
        """Run a search for *film_name* and gather result links.

        Up to ``_MAX_PAGES`` result pages are visited. Paging stops early
        when no "next" control is found, so the same page is never parsed
        twice. Exceptions are logged with their traceback, not propagated.

        Args:
            film_name: Search text; it is URL-encoded before being sent.

        Returns:
            The list of collected hrefs (possibly partial or empty if an
            error interrupted the run).
        """
        logger.info(f"Start parsing: {film_name}")
        result_urls = []
        try:
            self.driver.get(self._build_search_url(film_name))
            self._simulate_human()
            self.check_captcha()

            for page_id in range(1, self._MAX_PAGES + 1):
                result_urls.extend(self.parse_page(page_id))
                if page_id == self._MAX_PAGES:
                    # Last wanted page: no point in loading another one.
                    break
                if not self.get_next_page():
                    break
                self._simulate_human()
                self.check_captcha()
                time.sleep(random.uniform(2, 5))
        except Exception:
            logger.error(f"Exception: {traceback.format_exc()}")
        finally:
            logger.info(f"Found {len(result_urls)} results for '{film_name}': {result_urls}")
        return result_urls

    def parse_page(self, page_id):
        """Return the organic result hrefs found on the currently loaded page.

        Links whose href contains ``yabs.yandex.ru`` are skipped.

        Args:
            page_id: Page number, used only in log messages.

        Returns:
            A list of href strings; empty if nothing matched or lookup failed.
        """
        res = []
        try:
            urls_raw = self.driver.find_elements(By.XPATH, '//a[@class="Link Link_theme_normal OrganicTitle-Link organic__url link"]')
            for url_raw in urls_raw:
                href = url_raw.get_attribute("href")
                if href and "yabs.yandex.ru" not in href:
                    res.append(href)
            logger.info(f"Found {len(res)} URLs on page {page_id}")
        except Exception as e:
            logger.warning(f"Could not parse page {page_id}: {e}")
        return res

    def get_next_page(self):
        """Click the pager's "next" control if it is present.

        Returns:
            True if the control was found and clicked, False if it was
            missing or the click failed (the failure is logged).
        """
        try:
            next_btn = self.driver.find_elements(By.XPATH, '//div[@class="Pager-ListItem Pager-ListItem_type_next"]')
            if not next_btn:
                return False
            next_btn[0].click()
            time.sleep(random.uniform(3, 6))
            return True
        except Exception as e:
            logger.warning(f"Next page navigation failed: {e}")
            return False

if __name__ == "__main__":
    # Directory that check_captcha writes its screenshots into.
    pathlib.Path('screens/').mkdir(exist_ok=True)
    parser = YandexParser(USE_GUI=True)  # GUI mode for better stealth

    queries = ["Терминатор смотреть", "Саша Таня смотреть", "Джон Уик смотреть онлайн"]
    cursor = 0

    try:
        # Cycle through the queries endlessly, pausing 8-15 s between searches.
        while True:
            current, cursor = queries[cursor], (cursor + 1) % len(queries)
            parser.parse(current)
            time.sleep(random.uniform(8, 15))
    except Exception as err:
        logger.error(f"Fatal error: {err}")
    finally:
        # Always release the browser, including on Ctrl+C.
        parser.close()
Reasons:
  • RegEx Blacklisted phrase (1.5): Reputation
  • Long answer (-1):
  • Has code block (-0.5):
  • Low reputation (1):
Posted by: Keshav Khandelwal