Server IP Reputation: Cloud/VPS providers (AWS, Azure, etc.) often have IPs flagged as suspicious or part of known data centers.
Headless mode: Even with undetected_chromedriver, running in headless mode can increase the likelihood of detection.
Identifiable browser fingerprints: Subtle mismatches in JS APIs, fonts, canvas fingerprinting, or automation indicators.
Repeated queries: Frequent or patterned queries look automated.
Use residential proxies instead of datacenter IPs to mimic real users.
Rotate proxies to avoid rate-limiting.
self.chrome_options.add_argument('--proxy-server=http://user:pass@proxyhost:port')
Headless detection has improved significantly.
parser = YandexParser(USE_GUI=True)
Add mouse movements and scrolling before parsing or clicking.
Use a library such as pyautogui, or simulate the input manually via Selenium:
from selenium.webdriver.common.action_chains import ActionChains
ActionChains(driver).move_by_offset(100, 200).perform()
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
Add random wait times between actions.
Randomize user agents:
from fake_useragent import UserAgent
ua = UserAgent()
self.chrome_options.add_argument(f'user-agent={ua.random}')
Integrate a third-party CAPTCHA-solving service. These services solve CAPTCHAs using human labor or AI.
Add a longer pause, for example time.sleep(random.uniform(10, 30)), after each session or query group.
Sometimes mobile versions are less protected:
self.driver.get(f"https://m.ya.ru/search/?text={film_name}")
import undetected_chromedriver as uc
import random
from fake_useragent import UserAgent
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from datetime import datetime
import time
import logging
import traceback
import pathlib
# Log every record to both a file and the console.
# The log messages contain the (Cyrillic) search queries, so the file handler
# is opened explicitly as UTF-8 instead of relying on the platform's default
# encoding, which cannot represent those characters on some systems.
file_handler = logging.FileHandler("yandex_parser.log", encoding="utf-8")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        file_handler,
        logging.StreamHandler(),
    ],
)
# Shared logger used by the parser class and the script entry point.
logger = logging.getLogger('ParserLogger')
class YandexParser():
    """Collect organic result links from Yandex search through a Chrome session.

    The constructor starts an ``undetected_chromedriver`` Chrome instance,
    opens https://ya.ru/ and performs a short scroll/mouse warm-up. Use
    :meth:`parse` for each query and :meth:`close` when done.
    """

    # Directory that receives the diagnostic screenshots.
    SCREENSHOT_DIR = "screens"

    def __init__(self, USE_GUI=True):
        """Start the browser and open the Yandex start page.

        Args:
            USE_GUI: When False, Chrome is started with ``--headless=new``.

        Raises:
            Exception: Any error raised while the session is being prepared.
                The browser is shut down before the error propagates, so a
                failed start does not leave a Chrome process behind.
        """
        self.ua = UserAgent()
        self.chrome_options = uc.ChromeOptions()
        if not USE_GUI:
            self.chrome_options.add_argument('--headless=new')  # headless=new is less detectable
        # NOTE(review): the flattened source does not show whether the next two
        # flags belonged to the headless branch; they are applied
        # unconditionally here - confirm.
        self.chrome_options.add_argument('--no-sandbox')
        self.chrome_options.add_argument('--disable-dev-shm-usage')

        # Stealth settings
        self.chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        self.chrome_options.add_argument("--disable-infobars")
        self.chrome_options.add_argument("--disable-popup-blocking")
        self.chrome_options.add_argument("--start-maximized")
        self.chrome_options.add_argument(f"user-agent={self.ua.random}")

        # Optional: Add proxy here if you have one
        # self.chrome_options.add_argument('--proxy-server=http://user:pass@proxy_host:proxy_port')

        self.driver = uc.Chrome(options=self.chrome_options)
        try:
            # Hide webdriver
            self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                'source': '''
                    Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
                '''
            })
            self.driver.get("https://ya.ru/")
            self._simulate_human()
            time.sleep(random.uniform(2, 4))
        except BaseException:
            # The caller never receives the instance when start-up fails, so
            # nobody else could close the already running browser.
            self.driver.quit()
            raise

    @staticmethod
    def _build_search_url(film_name: str) -> str:
        """Return the ya.ru search URL for *film_name* with the query escaped."""
        from urllib.parse import quote

        # Escape everything (including '&', '#', '+', spaces and non-ASCII
        # text) so the query cannot spill into the other URL parameters.
        query = quote(film_name, safe="")
        return (
            f"https://ya.ru/search/?text={query}"
            "&lr=213&search_source=yaru_desktop_common&search_domain=yaru"
        )

    @staticmethod
    def _timestamp() -> str:
        """Return the current local time in a form that is safe in file names."""
        # No ':' or ' ' in the result: colons are not allowed in Windows paths.
        return datetime.now().strftime("%Y-%m-%d_%H-%M-%S_%f")

    def _save_screenshot(self, stem: str) -> None:
        """Save a screenshot as ``<SCREENSHOT_DIR>/<stem>.png``."""
        # Create the folder on demand so the class also works when it is used
        # without the script entry point that normally creates it.
        pathlib.Path(self.SCREENSHOT_DIR).mkdir(parents=True, exist_ok=True)
        self.driver.save_screenshot(f"{self.SCREENSHOT_DIR}/{stem}.png")

    def _simulate_human(self):
        """Scroll a third of the page and move the mouse by a random offset.

        Best effort: any failure is logged as a warning and otherwise ignored.
        """
        try:
            # Scroll and mouse movement
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight / 3);")
            ActionChains(self.driver).move_by_offset(
                random.randint(5, 200), random.randint(5, 200)
            ).perform()
            time.sleep(random.uniform(0.5, 1.5))
        except Exception as e:
            logger.warning(f"Human simulation failed: {e}")

    def close(self):
        """Quit the browser."""
        self.driver.quit()

    def check_captcha(self):
        """Screenshot the current page and try to click through a captcha.

        When the current URL contains ``showcaptcha`` the captcha checkbox is
        clicked and screenshots are taken before and after the click; a failed
        click is logged and otherwise ignored. On any other page a single
        screenshot is taken.
        """
        stamp = self._timestamp()
        if "showcaptcha" not in self.driver.current_url:
            self._save_screenshot(f"img_{stamp}")
            return

        logger.warning("Captcha found")
        self._save_screenshot(f"img_captcha_{stamp}")
        try:
            button = self.driver.find_element(By.XPATH, "//input[@class='CheckboxCaptcha-Button']")
            button.click()
            logger.info("Captcha checkbox clicked")
            time.sleep(random.uniform(1.5, 3))
            self._save_screenshot(f"img_captcha_afterclick_{stamp}")
        except Exception as e:
            logger.warning(f"Captcha click failed: {e}")

    def parse(self, film_name: str, max_pages: int = 4):
        """Search for *film_name* and collect result links from the result pages.

        Args:
            film_name: Search query; it is URL-escaped before being sent.
            max_pages: Upper bound on the number of result pages to read.

        Returns:
            The list of collected result URLs. Errors are logged rather than
            raised, so the list holds whatever was gathered before a failure.
        """
        logger.info(f"Start parsing: {film_name}")
        result_urls = []
        try:
            self.driver.get(self._build_search_url(film_name))
            self._simulate_human()
            self.check_captcha()

            for page_id in range(1, max_pages + 1):
                result_urls.extend(self.parse_page(page_id))
                # Stop after the last wanted page, or when there is no further
                # page: staying on the same page would only re-collect the
                # same links.
                if page_id == max_pages or not self.get_next_page():
                    break
                self._simulate_human()
                self.check_captcha()
                time.sleep(random.uniform(2, 5))
        except Exception:
            logger.error(f"Exception: {traceback.format_exc()}")
        finally:
            logger.info(f"Found {len(result_urls)} results for '{film_name}': {result_urls}")
        return result_urls

    def parse_page(self, page_id):
        """Return the organic result links found on the currently loaded page.

        Args:
            page_id: Page number, used only in log messages.

        Returns:
            A list of ``href`` values, skipping empty ones and those that
            contain ``yabs.yandex.ru``. Links gathered before an error are
            still returned.
        """
        res = []
        try:
            urls_raw = self.driver.find_elements(
                By.XPATH,
                '//a[@class="Link Link_theme_normal OrganicTitle-Link organic__url link"]',
            )
            for url_raw in urls_raw:
                href = url_raw.get_attribute("href")
                if href and "yabs.yandex.ru" not in href:
                    res.append(href)
            logger.info(f"Found {len(res)} URLs on page {page_id}")
        except Exception as e:
            logger.warning(f"Could not parse page {page_id}: {e}")
        return res

    def get_next_page(self):
        """Click the pager's "next" item if it is present.

        Returns:
            True when the item was found and clicked, False when it is missing
            or the click failed (the failure is logged).
        """
        try:
            next_btn = self.driver.find_elements(
                By.XPATH, '//div[@class="Pager-ListItem Pager-ListItem_type_next"]'
            )
            if not next_btn:
                return False
            next_btn[0].click()
            time.sleep(random.uniform(3, 6))
            return True
        except Exception as e:
            logger.warning(f"Next page navigation failed: {e}")
            return False
if __name__ == "__main__":
    # Make sure the screenshot folder is there before the browser starts.
    pathlib.Path('screens/').mkdir(exist_ok=True)

    # A visible browser window is used here for better stealth.
    parser = YandexParser(USE_GUI=True)
    queries = ["Терминатор смотреть", "Саша Таня смотреть", "Джон Уик смотреть онлайн"]

    try:
        # Walk through the queries round-robin, forever, pausing a random
        # 8-15 seconds after every search.
        while True:
            for query in queries:
                parser.parse(query)
                time.sleep(random.uniform(8, 15))
    except Exception as e:
        logger.error(f"Fatal error: {e}")
    finally:
        # Always shut the browser down, whatever ended the loop.
        parser.close()