forked from brightdata/sdk-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclient.py
More file actions
532 lines (447 loc) · 18.9 KB
/
client.py
File metadata and controls
532 lines (447 loc) · 18.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
"""
Main Bright Data SDK client - Single entry point for all services.
Philosophy:
- Client is the single source of truth for configuration
- Authentication should "just work" with minimal setup
- Fail fast and clearly when credentials are missing/invalid
- Follow principle of least surprise - common patterns from other SDKs
"""
import os
import asyncio
import warnings
from typing import Optional, Dict, Any, Union, List
from datetime import datetime, timezone
try:
from dotenv import load_dotenv
load_dotenv()
except ImportError:
pass
from .core.engine import AsyncEngine
from .core.zone_manager import ZoneManager
from .api.web_unlocker import WebUnlockerService
from .api.scrape_service import ScrapeService
from .api.search_service import SearchService
from .api.crawler_service import CrawlerService
from .models import ScrapeResult
from .types import AccountInfo
from .constants import (
HTTP_OK,
HTTP_UNAUTHORIZED,
HTTP_FORBIDDEN,
)
from .exceptions import ValidationError, AuthenticationError, APIError
class BrightDataClient:
"""
Main entry point for Bright Data SDK.
Single, unified interface for all BrightData services including scraping,
search, and crawling capabilities. Handles authentication, configuration,
and provides hierarchical access to specialized services.
Examples:
>>> # Simple instantiation - auto-loads from environment
>>> client = BrightDataClient()
>>>
>>> # Explicit token
>>> client = BrightDataClient(token="your_api_token")
>>>
>>> # Service access (planned)
>>> client.scrape.amazon.products(...)
>>> client.search.linkedin.jobs(...)
>>> client.crawler.discover(...)
>>>
>>> # Connection verification
>>> is_valid = await client.test_connection()
>>> info = await client.get_account_info()
"""
# Default configuration
DEFAULT_TIMEOUT = 30
DEFAULT_WEB_UNLOCKER_ZONE = "sdk_unlocker"
DEFAULT_SERP_ZONE = "sdk_serp"
DEFAULT_BROWSER_ZONE = "sdk_browser"
# Environment variable name for API token
TOKEN_ENV_VAR = "BRIGHTDATA_API_TOKEN"
def __init__(
self,
token: Optional[str] = None,
timeout: int = DEFAULT_TIMEOUT,
web_unlocker_zone: Optional[str] = None,
serp_zone: Optional[str] = None,
browser_zone: Optional[str] = None,
auto_create_zones: bool = True,
validate_token: bool = False,
rate_limit: Optional[float] = None,
rate_period: float = 1.0,
):
"""
Initialize Bright Data client.
Authentication happens automatically from environment variables if not provided.
Supports loading from .env files (requires python-dotenv package).
Args:
token: API token. If None, loads from BRIGHTDATA_API_TOKEN environment variable
(supports .env files via python-dotenv)
timeout: Default timeout in seconds for all requests (default: 30)
web_unlocker_zone: Zone name for web unlocker (default: "sdk_unlocker")
serp_zone: Zone name for SERP API (default: "sdk_serp")
browser_zone: Zone name for browser API (default: "sdk_browser")
auto_create_zones: Automatically create zones if they don't exist (default: True)
validate_token: Validate token by testing connection on init (default: False)
rate_limit: Maximum requests per rate_period (default: 10). Set to None to disable.
rate_period: Time period in seconds for rate limit (default: 1.0)
Raises:
ValidationError: If token is not provided and not found in environment
AuthenticationError: If validate_token=True and token is invalid
Example:
>>> # Auto-load from environment
>>> client = BrightDataClient()
>>>
>>> # Explicit configuration
>>> client = BrightDataClient(
... token="your_token",
... timeout=60,
... validate_token=True
... )
"""
self.token = self._load_token(token)
self.timeout = timeout
self.web_unlocker_zone = web_unlocker_zone or self.DEFAULT_WEB_UNLOCKER_ZONE
self.serp_zone = serp_zone or self.DEFAULT_SERP_ZONE
self.browser_zone = browser_zone or self.DEFAULT_BROWSER_ZONE
self.auto_create_zones = auto_create_zones
self.engine = AsyncEngine(
self.token, timeout=timeout, rate_limit=rate_limit, rate_period=rate_period
)
self._scrape_service: Optional[ScrapeService] = None
self._search_service: Optional[SearchService] = None
self._crawler_service: Optional[CrawlerService] = None
self._web_unlocker_service: Optional[WebUnlockerService] = None
self._zone_manager: Optional[ZoneManager] = None
self._is_connected = False
self._account_info: Optional[Dict[str, Any]] = None
self._zones_ensured = False
# Store for validation during __aenter__
self._validate_token_on_enter = validate_token
def _ensure_initialized(self) -> None:
"""
Ensure client is properly initialized (used as context manager).
Raises:
RuntimeError: If client not initialized via context manager
"""
if self.engine._session is None:
raise RuntimeError(
"BrightDataClient not initialized. "
"Use: async with BrightDataClient() as client: ..."
)
def _load_token(self, token: Optional[str]) -> str:
"""
Load token from parameter or environment variable.
Fails fast with clear error message if no token found.
Args:
token: Explicit token (takes precedence)
Returns:
Valid token string
Raises:
ValidationError: If no token found
"""
if token:
if not isinstance(token, str) or len(token.strip()) < 10:
raise ValidationError(
f"Invalid token format. Token must be a string with at least 10 characters. "
f"Got: {type(token).__name__} with length {len(str(token))}"
)
return token.strip()
# Try loading from environment variable
env_token = os.getenv(self.TOKEN_ENV_VAR)
if env_token:
return env_token.strip()
# No token found - fail fast with helpful message
raise ValidationError(
f"API token required but not found.\n\n"
f"Provide token in one of these ways:\n"
f" 1. Pass as parameter: BrightDataClient(token='your_token')\n"
f" 2. Set environment variable: {self.TOKEN_ENV_VAR}\n\n"
f"Get your API token from: https://brightdata.com/cp/api_keys"
)
async def _ensure_zones(self) -> None:
"""
Ensure required zones exist if auto_create_zones is enabled.
This is called automatically before the first API request.
Only runs once per client instance.
Raises:
ZoneError: If zone creation fails
AuthenticationError: If API token lacks permissions
"""
if self._zones_ensured or not self.auto_create_zones:
return
if self._zone_manager is None:
self._zone_manager = ZoneManager(self.engine)
# Don't pass browser_zone to auto-creation because browser zones
# require additional configuration and cannot be auto-created
await self._zone_manager.ensure_required_zones(
web_unlocker_zone=self.web_unlocker_zone,
serp_zone=self.serp_zone,
browser_zone=None, # Never auto-create browser zones
)
self._zones_ensured = True
@property
def scrape(self) -> ScrapeService:
"""
Access scraping services.
Provides hierarchical access to specialized scrapers:
- client.scrape.amazon.products(...)
- client.scrape.linkedin.profiles(...)
- client.scrape_url(...)
Returns:
ScrapeService instance for accessing scrapers
Example:
>>> result = client.scrape.amazon.products(
... url="https://amazon.com/dp/B0123456"
... )
"""
if self._scrape_service is None:
self._scrape_service = ScrapeService(self)
return self._scrape_service
@property
def search(self) -> SearchService:
"""
Access search services (SERP API).
Provides access to search engine result scrapers:
- client.search.google(query="...")
- client.search.bing(query="...")
- client.search.linkedin.jobs(...)
Returns:
SearchService instance for search operations
Example:
>>> results = client.search.google(
... query="python scraping",
... num_results=10
... )
"""
if self._search_service is None:
self._search_service = SearchService(self)
return self._search_service
@property
def crawler(self) -> CrawlerService:
"""
Access web crawling services.
Provides access to domain crawling capabilities:
- client.crawler.discover(url="...")
- client.crawler.sitemap(url="...")
Returns:
CrawlerService instance for crawling operations
Example:
>>> result = client.crawler.discover(
... url="https://example.com",
... depth=3
... )
"""
if self._crawler_service is None:
self._crawler_service = CrawlerService(self)
return self._crawler_service
async def test_connection(self) -> bool:
"""
Test API connection and token validity.
Makes a lightweight API call to verify:
- Token is valid
- API is reachable
- Account is active
Returns:
True if connection successful, False otherwise (never raises exceptions)
Note:
This method never raises exceptions - it returns False for any errors
(invalid token, network issues, etc.). This makes it safe for testing
connectivity without exception handling.
Client must be used as context manager before calling this method.
Example:
>>> async with BrightDataClient() as client:
... is_valid = await client.test_connection()
... if is_valid:
... print("Connected successfully!")
"""
self._ensure_initialized()
try:
async with self.engine.get_from_url(
f"{self.engine.BASE_URL}/zone/get_active_zones"
) as response:
if response.status == HTTP_OK:
self._is_connected = True
return True
else:
self._is_connected = False
return False
except (asyncio.TimeoutError, OSError, Exception):
self._is_connected = False
return False
async def get_account_info(self, refresh: bool = False) -> AccountInfo:
"""
Get account information including usage, limits, and quotas.
Note: This method caches the result by default. For fresh zone data,
use list_zones() instead, or pass refresh=True.
Retrieves:
- Account status
- Active zones
- Usage statistics
- Credit balance
- Rate limits
Args:
refresh: If True, bypass cache and fetch fresh data (default: False)
Returns:
Dictionary with account information
Raises:
AuthenticationError: If token is invalid
APIError: If API request fails
Example:
>>> # Cached version (fast)
>>> info = await client.get_account_info()
>>> print(f"Active zones: {len(info['zones'])}")
>>> # Fresh data (use this after creating/deleting zones)
>>> info = await client.get_account_info(refresh=True)
>>> print(f"Active zones: {len(info['zones'])}")
>>> # Or better: use list_zones() for current zone list
>>> zones = await client.list_zones()
"""
if self._account_info is not None and not refresh:
return self._account_info
self._ensure_initialized()
try:
async with self.engine.get_from_url(
f"{self.engine.BASE_URL}/zone/get_active_zones"
) as zones_response:
if zones_response.status == HTTP_OK:
zones = await zones_response.json()
zones = zones or []
# Warn user if no active zones found (they might be inactive)
if not zones:
warnings.warn(
"No active zones found. This could mean:\n"
"1. Your zones might be inactive - activate them in the Bright Data dashboard\n"
"2. You might need to create zones first\n"
"3. Check your dashboard at https://brightdata.com for zone status\n\n"
"Note: The API only returns active zones. Inactive zones won't appear here.",
UserWarning,
stacklevel=2,
)
account_info = {
"zones": zones,
"zone_count": len(zones),
"token_valid": True,
"retrieved_at": datetime.now(timezone.utc).isoformat(),
}
self._account_info = account_info
return account_info
elif zones_response.status in (HTTP_UNAUTHORIZED, HTTP_FORBIDDEN):
error_text = await zones_response.text()
raise AuthenticationError(
f"Invalid token (HTTP {zones_response.status}): {error_text}"
)
else:
error_text = await zones_response.text()
raise APIError(
f"Failed to get account info (HTTP {zones_response.status}): {error_text}",
status_code=zones_response.status,
)
except (AuthenticationError, APIError):
raise
except Exception as e:
raise APIError(f"Unexpected error getting account info: {str(e)}")
async def list_zones(self) -> List[Dict[str, Any]]:
"""
List all active zones in your Bright Data account.
Returns:
List of zone dictionaries with their configurations
Raises:
ZoneError: If zone listing fails
AuthenticationError: If authentication fails
Example:
>>> async with BrightDataClient() as client:
... zones = await client.list_zones()
... print(f"Found {len(zones)} zones")
... for zone in zones:
... print(f" - {zone['name']}: {zone.get('type', 'unknown')}")
"""
self._ensure_initialized()
if self._zone_manager is None:
self._zone_manager = ZoneManager(self.engine)
return await self._zone_manager.list_zones()
async def delete_zone(self, zone_name: str) -> None:
"""
Delete a zone from your Bright Data account.
Args:
zone_name: Name of the zone to delete
Raises:
ZoneError: If zone deletion fails or zone doesn't exist
AuthenticationError: If authentication fails
APIError: If API request fails
Example:
>>> # Delete a test zone
>>> await client.delete_zone("test_zone_123")
>>> print("Zone deleted successfully")
>>> # With error handling
>>> try:
... await client.delete_zone("my_zone")
... except ZoneError as e:
... print(f"Failed to delete zone: {e}")
"""
self._ensure_initialized()
if self._zone_manager is None:
self._zone_manager = ZoneManager(self.engine)
await self._zone_manager.delete_zone(zone_name)
async def scrape_url(
self,
url: Union[str, List[str]],
zone: Optional[str] = None,
country: str = "",
response_format: str = "raw",
method: str = "GET",
timeout: Optional[int] = None,
mode: str = "sync",
poll_interval: int = 2,
poll_timeout: int = 30,
) -> Union[ScrapeResult, List[ScrapeResult]]:
"""
Direct scraping method (flat API).
For backward compatibility. Prefer using hierarchical API:
client.scrape_url(...) for new code.
Args:
url: Single URL or list of URLs to scrape
zone: Zone name (uses web_unlocker_zone if not provided)
country: Country code for proxy location
response_format: "raw" for HTML or "json" for structured data
method: HTTP method (default: GET)
timeout: Request timeout in seconds
mode: "sync" (default, blocking) or "async" (non-blocking with polling)
poll_interval: Seconds between polls (async mode only, default: 2)
poll_timeout: Max wait time in seconds (async mode only, default: 30)
"""
self._ensure_initialized()
if self._web_unlocker_service is None:
self._web_unlocker_service = WebUnlockerService(self.engine)
zone = zone or self.web_unlocker_zone
return await self._web_unlocker_service.scrape_async(
url=url,
zone=zone,
country=country,
response_format=response_format,
method=method,
timeout=timeout,
mode=mode,
poll_interval=poll_interval,
poll_timeout=poll_timeout,
)
async def __aenter__(self):
"""Async context manager entry."""
await self.engine.__aenter__()
# Validate token if requested
if self._validate_token_on_enter:
is_valid = await self.test_connection()
if not is_valid:
await self.engine.__aexit__(None, None, None)
raise AuthenticationError("Token validation failed. Please check your API token.")
await self._ensure_zones()
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
"""Async context manager exit."""
await self.engine.__aexit__(exc_type, exc_val, exc_tb)
def __repr__(self) -> str:
"""String representation for debugging."""
token_preview = f"{self.token[:10]}...{self.token[-5:]}" if self.token else "None"
status = "Connected" if self._is_connected else "Not tested"
return f"<BrightDataClient token={token_preview} status='{status}'>"