-
Notifications
You must be signed in to change notification settings - Fork 168
Expand file tree
/
Copy pathget_cookies.py
More file actions
219 lines (191 loc) · 8.48 KB
/
get_cookies.py
File metadata and controls
219 lines (191 loc) · 8.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import asyncio
import random
import httpx
from loguru import logger
from playwright.async_api import async_playwright
from playwright_stealth import Stealth
from typing import Optional, Dict, List
from dto import Proxy, ProxySplit
from playwright_setup import ensure_playwright_installed
# Default number of attempts PlaywrightClient.change_ip makes against the IP-rotation link.
MAX_RETRIES = 3
# Pause, in seconds, between two failed IP-rotation attempts.
RETRY_DELAY = 10
# Number of one-second sleeps performed instead of an IP change when no proxy is configured.
RETRY_DELAY_WITHOUT_PROXY = 300
# Substring looked for in the lower-cased page title to decide that the current IP is blocked.
BAD_IP_TITLE = "проблема с ip"
class PlaywrightClient:
    """Drive a stealth-patched Chromium page and collect its cookies.

    The client optionally routes traffic through an authenticated proxy,
    opens the requested URL, and polls ``document.cookie`` until the ``ft``
    cookie shows up. When the page title signals a blocked IP it asks the
    proxy's rotation link for a new address (or simply waits when no proxy
    is configured) and reloads the page.

    Args:
        proxy: Proxy description exposing ``proxy_string`` and
            ``change_ip_link``; ``None`` disables proxying.
        headless: Whether Chromium is launched without a visible window.
        user_agent: User-Agent for the browser context; a desktop Chrome
            string is used when omitted.
        stop_event: Optional object with an ``is_set()`` method that is
            polled to abort long waits early (presumably a
            ``threading.Event``/``asyncio.Event`` -- confirm with callers).
    """

    def __init__(
        self,
        proxy: Optional[Proxy] = None,
        headless: bool = True,
        user_agent: Optional[str] = None,
        stop_event=None,
    ):
        self.proxy = proxy
        self.proxy_split_obj = self.get_proxy_obj()
        self.headless = headless
        self.user_agent = user_agent or "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
        self.context = self.page = self.browser = None
        # Created in launch_browser(); initialised here so that cleanup can
        # test them directly instead of relying on hasattr().
        self.playwright = self.playwright_context = None
        self.stop_event = stop_event

    @staticmethod
    def check_protocol(ip_port: str) -> str:
        """Return *ip_port* with an ``http://`` scheme prepended if it has none."""
        if "http://" not in ip_port:
            return f"http://{ip_port}"
        return ip_port

    @staticmethod
    def del_protocol(proxy_string: str):
        """Return *proxy_string* without its leading ``scheme://`` part, if any."""
        if "//" in proxy_string:
            return proxy_string.split("//")[1]
        return proxy_string

    def get_proxy_obj(self) -> ProxySplit | None:
        """Split ``self.proxy.proxy_string`` into server address and credentials.

        Accepted layouts are ``ip:port@user:pass``, ``user:pass@ip:port``,
        ``ip:port:user:pass`` and ``user:pass:ip:port``. The host part is
        recognised by the presence of a dot.

        Side effect: ``self.proxy.proxy_string`` is overwritten with its
        scheme-less form.

        Returns:
            A ``ProxySplit`` on success; ``None`` when no proxy is configured
            or the string cannot be parsed (the failure is logged, not raised).
        """
        if not self.proxy:
            return None
        try:
            self.proxy.proxy_string = self.del_protocol(proxy_string=self.proxy.proxy_string)
            if "@" in self.proxy.proxy_string:
                ip_port, user_pass = self.proxy.proxy_string.split("@")
                # NOTE(review): the dot test assumes credentials never contain
                # a "."; a dotted password in ip:port@user:pass form would be
                # mistaken for the host part.
                if "." in user_pass:
                    ip_port, user_pass = user_pass, ip_port
                login, password = str(user_pass).split(":")
            else:
                login, password, ip, port = self.proxy.proxy_string.split(":")
                if "." in login:
                    # First field looks like a host, so the layout is ip:port:user:pass.
                    login, password, ip, port = ip, port, login, password
                ip_port = f"{ip}:{port}"
            ip_port = self.check_protocol(ip_port=ip_port)
            return ProxySplit(
                ip_port=ip_port,
                login=login,
                password=password,
                change_ip_link=self.proxy.change_ip_link
            )
        except Exception as err:
            # Any parsing problem (wrong number of fields, missing attribute)
            # disables the proxy rather than aborting construction.
            logger.error(err)
            logger.critical("Прокси в таком формате не поддерживаются. "
                            "Используй: ip:port@user:pass или ip:port:user:pass")
            return None

    @staticmethod
    def parse_cookie_string(cookie_str: str) -> dict:
        """Convert a ``"k1=v1; k2=v2"`` cookie header string into a dict.

        Pairs without ``=`` are skipped; only the first ``=`` of a pair
        separates the name from the value.
        """
        return dict(pair.split("=", 1) for pair in cookie_str.split("; ") if "=" in pair)

    async def launch_browser(self):
        """Start Playwright, launch Chromium and open a fresh page.

        Populates ``self.playwright_context``, ``self.playwright``,
        ``self.browser``, ``self.context`` and ``self.page``. The proxy, when
        configured, is applied at browser-context level.
        """
        ensure_playwright_installed("chromium")
        stealth = Stealth()
        self.playwright_context = stealth.use_async(async_playwright())
        playwright = await self.playwright_context.__aenter__()
        self.playwright = playwright

        launch_args = {
            "headless": self.headless,
            "chromium_sandbox": False,
            "args": [
                "--disable-blink-features=AutomationControlled",
                "--no-sandbox",
                "--disable-dev-shm-usage",
                "--start-maximized",
                "--window-size=1920,1080",
            ]
        }
        self.browser = await playwright.chromium.launch(**launch_args)

        context_args = {
            "user_agent": self.user_agent,
            "viewport": {"width": 1920, "height": 1080},
            "screen": {"width": 1920, "height": 1080},
            "device_scale_factor": 1,
            "is_mobile": False,
            "has_touch": False,
        }
        if self.proxy_split_obj:
            context_args["proxy"] = {
                "server": self.proxy_split_obj.ip_port,
                "username": self.proxy_split_obj.login,
                "password": self.proxy_split_obj.password
            }
        self.context = await self.browser.new_context(**context_args)
        self.page = await self.context.new_page()

        # Image blocking is currently disabled; to enable it, route requests
        # through _block_images:
        # await self.page.route("**/*", lambda route, request: asyncio.create_task(self._block_images(route, request)))
        await self._stealth(self.page)

    async def load_page(self, url: str):
        """Open *url* and poll until the ``ft`` cookie is present.

        Up to ten polls are made, five seconds apart. Every poll first runs
        the blocked-IP check, which may rotate the IP and reload the page.

        Returns:
            The parsed cookie dict once it contains a truthy ``ft`` entry;
            an empty dict if ``stop_event`` is set or all polls are exhausted.
        """
        await self.page.goto(url=url,
                             timeout=60_000,
                             wait_until="domcontentloaded")

        for _ in range(10):
            if self.stop_event and self.stop_event.is_set():
                return {}
            await self.check_block(self.page, self.context)
            raw_cookie = await self.page.evaluate("() => document.cookie")
            cookie_dict = self.parse_cookie_string(raw_cookie)
            if cookie_dict.get("ft"):
                logger.info("Cookies получены")
                return cookie_dict
            await asyncio.sleep(5)

        logger.warning("Не удалось получить cookies")
        return {}

    async def extract_cookies(self, url: str) -> dict:
        """Launch the browser, collect cookies from *url*, then always clean up.

        Returns:
            Whatever ``load_page`` returns (a cookie dict, possibly empty).
        """
        try:
            await self.launch_browser()
            return await self.load_page(url)
        finally:
            await self._close_browser()

    async def _close_browser(self):
        """Release the browser and Playwright, attempting every step.

        The steps are nested in ``try/finally`` so that a failure while
        closing the browser does not leave the Playwright driver running.
        Exceptions still propagate to the caller. Handles are cleared
        afterwards so a later run never touches already-closed objects.
        """
        try:
            if self.browser:
                await self.browser.close()
        finally:
            try:
                if self.playwright:
                    await self.playwright.stop()
            finally:
                try:
                    if self.playwright_context:
                        await self.playwright_context.__aexit__(None, None, None)
                finally:
                    self.context = self.page = self.browser = None
                    self.playwright = self.playwright_context = None

    async def get_cookies(self, url: str) -> dict:
        """Public entry point; thin alias for ``extract_cookies``."""
        return await self.extract_cookies(url)

    async def check_block(self, page, context):
        """Detect the blocked-IP page by its title and try to recover.

        When the lower-cased title contains ``BAD_IP_TITLE`` the context
        cookies are cleared, ``change_ip`` is awaited and the page is reloaded.
        """
        title = await page.title()
        logger.info(f"Не ошибка, а название страницы: {title}")
        if BAD_IP_TITLE in str(title).lower():
            logger.info("IP заблокирован")
            await context.clear_cookies()
            await self.change_ip()
            await page.reload(timeout=60 * 1000)

    async def change_ip(self, retries: int = MAX_RETRIES):
        """Ask the proxy's rotation link for a new IP address.

        Without a proxy there is nothing to rotate, so the method waits
        ``RETRY_DELAY_WITHOUT_PROXY`` one-second ticks (checking
        ``stop_event`` on each tick) and reports failure.

        Args:
            retries: Maximum number of requests sent to the rotation link.

        Returns:
            ``True`` as soon as the link answers with HTTP 200, otherwise
            ``False``.
        """
        if not self.proxy_split_obj:
            logger.info("Сейчас бы сменили ip, но прокси нет - поэтому ждем")
            for _ in range(RETRY_DELAY_WITHOUT_PROXY):
                if self.stop_event and self.stop_event.is_set():
                    return False
                await asyncio.sleep(1)
            return False

        rotate_url = self.proxy_split_obj.change_ip_link + "&format=json"
        for attempt in range(1, retries + 1):
            try:
                # An async client keeps the event loop responsive while the
                # (up to 20 s) request is in flight.
                async with httpx.AsyncClient(timeout=20) as client:
                    response = await client.get(rotate_url)
            except httpx.RequestError as e:
                logger.error(f"[{attempt}/{retries}] Ошибка смены IP: {e}")
            else:
                if response.status_code == 200:
                    # The rotation already succeeded; an unexpected body must
                    # not turn that into a failure.
                    try:
                        new_ip = response.json().get('new_ip')
                    except (ValueError, AttributeError):
                        new_ip = None
                    logger.info(f"IP изменён на {new_ip}")
                    return True
                logger.warning(f"[{attempt}/{retries}] Ошибка смены IP: {response.status_code}")

            if attempt < retries:
                logger.info(f"Повторная попытка сменить IP через {RETRY_DELAY} секунд...")
                await asyncio.sleep(RETRY_DELAY)
            else:
                logger.error("Превышено количество попыток смены IP")
        return False

    @staticmethod
    async def _stealth(page):
        """Install an init script that overrides several ``navigator`` properties.

        The script redefines ``webdriver``, ``platform``, ``vendor``,
        ``plugins`` and ``languages`` and sets ``window.chrome``.
        """
        await page.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
            Object.defineProperty(navigator, 'platform', { get: () => 'Win32' });
            Object.defineProperty(navigator, 'vendor', { get: () => 'Google Inc.' });
            window.chrome = { runtime: {} };
            Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] });
            Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
        """)

    @staticmethod
    async def _block_images(route, request):
        """Route handler that aborts image requests and lets everything else through."""
        if request.resource_type == "image":
            await route.abort()
        else:
            await route.continue_()
async def get_cookies(proxy: Proxy = None, headless: bool = True, stop_event=None) -> tuple:
    """Fetch fresh cookies from avito.ru through a throw-away browser session.

    A random ten-digit number is used as the path of the requested URL.

    Args:
        proxy: Optional proxy description forwarded to ``PlaywrightClient``.
        headless: Whether the browser runs without a visible window.
        stop_event: Optional cancellation flag forwarded to the client.

    Returns:
        A ``(cookies, user_agent)`` pair: the cookie dict produced by the
        client and the User-Agent string the client used.
    """
    logger.info("Пытаюсь обновить cookies")

    random_ad_id = random.randint(1111111111, 9999999999)
    target_url = f"https://www.avito.ru/{random_ad_id}"

    browser_client = PlaywrightClient(proxy=proxy, headless=headless, stop_event=stop_event)
    fresh_cookies = await browser_client.get_cookies(target_url)
    return fresh_cookies, browser_client.user_agent