Basic Contact Extraction
Overview
The most common SpiderIQ use case: extract contact information from company websites without using any AI tokens. This guide shows you how to get emails, phone numbers, addresses, and social media profiles in 15-30 seconds.
Zero AI costs: This approach uses 0 AI tokens - you only pay for the crawling infrastructure.
What You Get
With basic contact extraction, SpiderIQ automatically finds and structures:
Email Addresses
Filtered and validated emails (tracking emails removed)
Phone Numbers
All formats detected and normalized
Physical Addresses
Full street addresses extracted
Social Media
14 platforms: LinkedIn, Twitter, Facebook, Instagram, YouTube, GitHub, and more
Plus: Markdown Compendium
Every crawl includes a smart markdown summary of the website (configurable from 30% to 100% of original size), giving you full transparency of what was found.
Quick Start
1. Submit a Job (Minimal Request)
The simplest possible request - just a URL:
- cURL
- Python
- JavaScript
curl -X POST https://spideriq.ai/api/v1/jobs/spiderSite/submit \
-H "Authorization: Bearer YOUR_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"payload": {
"url": "https://example.com"
}
}'
import requests
response = requests.post(
"https://spideriq.ai/api/v1/jobs/spiderSite/submit",
headers={"Authorization": "Bearer YOUR_API_KEY"},
json={
"payload": {
"url": "https://example.com"
}
}
)
job = response.json()
job_id = job['job_id']
print(f"Job submitted: {job_id}")
const response = await fetch(
'https://spideriq.ai/api/v1/jobs/spiderSite/submit',
{
method: 'POST',
headers: {
'Authorization': 'Bearer YOUR_API_KEY',
'Content-Type': 'application/json'
},
body: JSON.stringify({
payload: {
url: 'https://example.com'
}
})
}
);
const job = await response.json();
console.log('Job ID:', job.job_id);
Response:
{
"job_id": "974ceeda-84fe-4634-bdcd-adc895c6bc75",
"type": "spiderSite",
"status": "queued",
"created_at": "2025-10-27T14:30:00Z",
"from_cache": false,
"message": "SpiderSite job queued successfully. Estimated processing time: 15-30 seconds."
}
2. Poll for Results
Wait 15-30 seconds, then retrieve the results:
- cURL
- Python
- JavaScript
curl https://spideriq.ai/api/v1/jobs/{job_id}/results \
-H "Authorization: Bearer YOUR_API_KEY"
import requests
import time
def wait_for_results(job_id, api_key, max_wait=120):
"""Poll until job completes (max 2 minutes)"""
url = f"https://spideriq.ai/api/v1/jobs/{job_id}/results"
headers = {"Authorization": f"Bearer {api_key}"}
for i in range(max_wait // 3):
response = requests.get(url, headers=headers)
if response.status_code == 200:
return response.json() # Success!
elif response.status_code == 202:
print(f"Processing... ({i*3}s elapsed)")
time.sleep(3)
elif response.status_code == 410:
raise Exception(f"Job failed: {response.json()['error_message']}")
else:
response.raise_for_status()
raise TimeoutError("Job did not complete in 2 minutes")
# Usage
results = wait_for_results(job_id, "YOUR_API_KEY")
print("Contact info:", results['data'])
async function waitForResults(jobId, apiKey, maxWait = 120000) {
const url = `https://spideriq.ai/api/v1/jobs/${jobId}/results`;
const headers = { 'Authorization': `Bearer ${apiKey}` };
const startTime = Date.now();
while (Date.now() - startTime < maxWait) {
const response = await fetch(url, { headers });
if (response.status === 200) {
return await response.json(); // Success!
} else if (response.status === 202) {
console.log('Processing...');
await new Promise(resolve => setTimeout(resolve, 3000));
} else if (response.status === 410) {
const error = await response.json();
throw new Error(`Job failed: ${error.error_message}`);
} else {
throw new Error(`Unexpected status: ${response.status}`);
}
}
throw new Error('Job did not complete in 2 minutes');
}
// Usage
const results = await waitForResults(job.job_id, 'YOUR_API_KEY');
console.log('Contact info:', results.data);
3. Extract Contact Data
Here's what a typical response looks like:
{
"success": true,
"job_id": "974ceeda-84fe-4634-bdcd-adc895c6bc75",
"type": "spiderSite",
"status": "completed",
"processing_time_seconds": 12.4,
"worker_id": "spider-site-main-1",
"completed_at": "2025-10-27T14:30:15Z",
"data": {
"url": "https://example.com",
"pages_crawled": 8,
"crawl_status": "success",
// Contact Information (Flat Structure)
"emails": [
"contact@example.com",
"sales@example.com",
"support@example.com"
],
"phones": [
"+1-555-123-4567",
"+1-800-555-0100"
],
"addresses": [
"123 Main St, San Francisco, CA 94105",
"456 Market St, Suite 200, San Francisco, CA 94105"
],
// Social Media (All 14 platforms - null if not found)
"linkedin": "https://linkedin.com/company/example",
"twitter": "https://twitter.com/example",
"facebook": "https://facebook.com/example",
"instagram": "https://instagram.com/example",
"youtube": "https://youtube.com/example",
"github": "https://github.com/example",
"tiktok": null,
"pinterest": null,
"medium": "https://medium.com/@example",
"discord": null,
"whatsapp": null,
"telegram": null,
"snapchat": null,
"reddit": null,
// Markdown Compendium
"markdown_compendium": "# Example Company\n\nWe provide enterprise solutions...",
"compendium": {
"chars": 8450,
"available": true,
"cleanup_level": "fit",
"storage_location": "inline"
},
// AI Features (all null - not enabled)
"company_vitals": null,
"pain_points": null,
"lead_scoring": null,
"team_members": [],
"personalization_hooks": null,
// Metadata
"metadata": {
"spa_enabled": true,
"sitemap_used": true,
"browser_rendering_available": true,
"crawl_strategy": "sitemap",
"total_emails_found": 3,
"total_phones_found": 2
}
},
"error_message": null
}
Understanding the Flat Structure
Breaking change (v2.7.1): Responses are now flat (2-3 levels max) instead of deeply nested (5 levels).
Old Structure (Pre-v2.7.1)
{
"results": {
"results": {
"contact_info": {
"emails": [...],
"social_media": {
"linkedin": "...",
"twitter": "..."
}
}
}
}
}
New Structure (v2.7.1+)
{
"data": {
"emails": [...],
"linkedin": "...",
"twitter": "..."
}
}
Benefits:
- Easier integration (fewer levels to navigate)
- Consistent structure (all fields always present)
- Industry standard (similar to Firecrawl/Outscraper)
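If you have integration code written against the pre-v2.7.1 nested shape, a small accessor can smooth the migration. This is a minimal sketch, not part of any SpiderIQ SDK; it simply checks which shape is present and returns the flat contact fields either way:
def get_contact_fields(response_json):
    """Return the flat contact fields from either response shape (illustrative only)."""
    if "data" in response_json:
        # v2.7.1+ flat structure: contact fields live directly under "data"
        return response_json["data"]
    # Pre-v2.7.1 nested structure: results -> results -> contact_info
    contact = response_json.get("results", {}).get("results", {}).get("contact_info", {})
    flat = {"emails": contact.get("emails", [])}
    flat.update(contact.get("social_media", {}))  # linkedin, twitter, ...
    return flat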
Accessing Contact Data
Python Example
# Get the results
results = response.json()
data = results['data']
# Extract contact info
emails = data['emails']
phones = data['phones']
addresses = data['addresses']
# Extract social media (filter out nulls)
social_media = {
platform: url
for platform in ['linkedin', 'twitter', 'facebook', 'instagram',
'youtube', 'github', 'tiktok', 'pinterest',
'medium', 'discord', 'whatsapp', 'telegram',
'snapchat', 'reddit']
if (url := data.get(platform)) is not None
}
print(f"Found {len(emails)} emails: {emails}")
print(f"Found {len(phones)} phones: {phones}")
print(f"Found {len(social_media)} social profiles: {social_media}")
# Access markdown compendium
markdown = data.get('markdown_compendium')
if markdown:
print(f"Content preview: {markdown[:200]}...")
JavaScript Example
// Get the results
const { data } = results;
// Extract contact info
const { emails, phones, addresses } = data;
// Extract social media (filter out nulls)
const socialPlatforms = [
'linkedin', 'twitter', 'facebook', 'instagram',
'youtube', 'github', 'tiktok', 'pinterest',
'medium', 'discord', 'whatsapp', 'telegram',
'snapchat', 'reddit'
];
const socialMedia = Object.fromEntries(
socialPlatforms
.map(platform => [platform, data[platform]])
.filter(([_, url]) => url !== null)
);
console.log(`Found ${emails.length} emails:`, emails);
console.log(`Found ${phones.length} phones:`, phones);
console.log(`Found ${Object.keys(socialMedia).length} social profiles:`, socialMedia);
// Access markdown compendium
if (data.markdown_compendium) {
console.log(`Content preview: ${data.markdown_compendium.substring(0, 200)}...`);
}
Customization Options
1. Crawl More Pages
Default is 10 pages. Increase for larger sites:
{
"payload": {
"url": "https://example.com",
"max_pages": 25
}
}
Processing time: ~1.5 seconds per page on average
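Since processing scales roughly with page count, you can size your polling timeout from max_pages instead of hard-coding 120 seconds. A rough sketch based on the ~1.5 seconds/page figure above; the 20-second floor and 2x safety margin are arbitrary choices, not API guarantees, and wait_for_results and job_id come from the Quick Start example:
def estimated_max_wait(max_pages, seconds_per_page=1.5, floor=20, safety_factor=2.0):
    """Rough polling timeout (seconds) for a crawl of max_pages pages."""
    return max(floor, int(max_pages * seconds_per_page * safety_factor))

# e.g. a 25-page crawl -> poll for up to ~75 seconds instead of a fixed 120
results = wait_for_results(job_id, "YOUR_API_KEY", max_wait=estimated_max_wait(25))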
2. Target Specific Pages
Prioritize contact-related pages (works in 36+ languages):
{
"payload": {
"url": "https://example.com",
"max_pages": 15,
"target_pages": ["contact", "about", "team", "locations"]
}
}
Multilingual examples:
- German: ["kontakt", "über-uns", "team"]
- Spanish: ["contacto", "acerca-de", "equipo"]
- French: ["contact", "à-propos", "équipe"]
3. Optimize Compendium Size
Control markdown size (affects token usage if feeding to LLMs):
- Minimal (30% size)
- Fit (60% size)
- Raw (100% size)
- Disabled
Best for LLM consumption - 70% token savings:
{
"payload": {
"url": "https://example.com",
"compendium": {
"cleanup_level": "minimal",
"max_chars": 50000
}
}
}
Default - removes nav/ads/footers:
{
"payload": {
"url": "https://example.com",
"compendium": {
"cleanup_level": "fit",
"max_chars": 100000
}
}
}
Complete fidelity - everything preserved:
{
"payload": {
"url": "https://example.com",
"compendium": {
"cleanup_level": "raw",
"max_chars": 200000
}
}
}
Contact extraction only - no markdown:
{
"payload": {
"url": "https://example.com",
"compendium": {
"enabled": false
}
}
}
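If you switch between these presets per use case, a small helper keeps payload construction in one place. This is a sketch; the preset names ("llm", "default", "archive", "contacts") are invented labels, and the sizes simply mirror the examples above:
COMPENDIUM_PRESETS = {
    "llm":      {"cleanup_level": "minimal", "max_chars": 50000},   # token-efficient
    "default":  {"cleanup_level": "fit", "max_chars": 100000},      # nav/ads/footers removed
    "archive":  {"cleanup_level": "raw", "max_chars": 200000},      # full fidelity
    "contacts": {"enabled": False},                                 # contact extraction only
}

def build_payload(url, preset="default", max_pages=10):
    """Assemble a spiderSite payload using one of the compendium presets above."""
    return {"payload": {"url": url, "max_pages": max_pages,
                        "compendium": COMPENDIUM_PRESETS[preset]}}

# e.g. build_payload("https://example.com", preset="llm") for LLM pipelines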
4. Handle JavaScript-Heavy Sites
SpiderIQ automatically detects SPAs (React/Vue/Angular), but you can force it:
{
"payload": {
"url": "https://modern-spa-site.com",
"enable_spa": true,
"spa_timeout": 45
}
}
Auto-detection: SPA rendering is automatic in most cases. Only set enable_spa if you notice incomplete data.
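If auto-detection misses a client-rendered site, one workable pattern is to retry the same URL with enable_spa forced on whenever the first crawl looks suspiciously empty. A hedged sketch reusing the wait_for_results helper from the Quick Start; the "empty" heuristic (no emails and no phones) is an assumption you may want to tune:
def extract_with_spa_fallback(url, api_key):
    """Crawl normally first; if no contacts come back, retry with SPA rendering forced on."""
    def run(payload):
        resp = requests.post(
            "https://spideriq.ai/api/v1/jobs/spiderSite/submit",
            headers={"Authorization": f"Bearer {api_key}"},
            json={"payload": payload},
        )
        resp.raise_for_status()
        return wait_for_results(resp.json()["job_id"], api_key)

    results = run({"url": url})
    data = results["data"]
    if not data["emails"] and not data["phones"]:  # heuristic: crawl looks incomplete
        results = run({"url": url, "enable_spa": True, "spa_timeout": 45})
    return results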
Complete Working Example
Here's a production-ready script that extracts contacts from multiple websites:
- Python - Bulk Extraction
- JavaScript - Bulk Extraction
import requests
import time
from typing import List, Dict
class SpiderIQClient:
def __init__(self, api_key: str):
self.api_key = api_key
self.base_url = "https://spideriq.ai/api/v1"
self.headers = {"Authorization": f"Bearer {api_key}"}
def submit_job(self, url: str, max_pages: int = 10) -> str:
"""Submit a contact extraction job"""
response = requests.post(
f"{self.base_url}/jobs/spiderSite/submit",
headers=self.headers,
json={
"payload": {
"url": url,
"max_pages": max_pages
}
}
)
response.raise_for_status()
return response.json()['job_id']
def get_results(self, job_id: str, max_wait: int = 120) -> Dict:
"""Poll for job results with timeout"""
url = f"{self.base_url}/jobs/{job_id}/results"
for _ in range(max_wait // 3):
response = requests.get(url, headers=self.headers)
if response.status_code == 200:
return response.json()
elif response.status_code == 202:
time.sleep(3)
elif response.status_code == 410:
error = response.json()
raise Exception(f"Job failed: {error['error_message']}")
else:
response.raise_for_status()
raise TimeoutError(f"Job {job_id} did not complete in {max_wait}s")
def extract_contacts(self, url: str, max_pages: int = 10) -> Dict:
"""Submit job and wait for results (one-shot)"""
job_id = self.submit_job(url, max_pages)
print(f"Processing {url}... (job: {job_id})")
return self.get_results(job_id)
# Usage: Extract contacts from multiple companies
client = SpiderIQClient("YOUR_API_KEY")
companies = [
"https://company1.com",
"https://company2.com",
"https://company3.com"
]
for url in companies:
try:
results = client.extract_contacts(url, max_pages=15)
data = results['data']
print(f"\nā {url}")
print(f" Emails: {data['emails']}")
print(f" Phones: {data['phones']}")
print(f" LinkedIn: {data.get('linkedin', 'Not found')}")
print(f" Pages crawled: {data['pages_crawled']}")
except Exception as e:
print(f"\nā {url}: {e}")
class SpiderIQClient {
constructor(apiKey) {
this.apiKey = apiKey;
this.baseUrl = 'https://spideriq.ai/api/v1';
this.headers = {
'Authorization': `Bearer ${apiKey}`,
'Content-Type': 'application/json'
};
}
async submitJob(url, maxPages = 10) {
const response = await fetch(
`${this.baseUrl}/jobs/spiderSite/submit`,
{
method: 'POST',
headers: this.headers,
body: JSON.stringify({
payload: { url, max_pages: maxPages }
})
}
);
if (!response.ok) throw new Error(`HTTP ${response.status}`);
const data = await response.json();
return data.job_id;
}
async getResults(jobId, maxWait = 120000) {
const url = `${this.baseUrl}/jobs/${jobId}/results`;
const startTime = Date.now();
while (Date.now() - startTime < maxWait) {
const response = await fetch(url, { headers: this.headers });
if (response.status === 200) {
return await response.json();
} else if (response.status === 202) {
await new Promise(resolve => setTimeout(resolve, 3000));
} else if (response.status === 410) {
const error = await response.json();
throw new Error(`Job failed: ${error.error_message}`);
} else {
throw new Error(`HTTP ${response.status}`);
}
}
throw new Error(`Job ${jobId} did not complete in ${maxWait}ms`);
}
async extractContacts(url, maxPages = 10) {
const jobId = await this.submitJob(url, maxPages);
console.log(`Processing ${url}... (job: ${jobId})`);
return await this.getResults(jobId);
}
}
// Usage: Extract contacts from multiple companies
const client = new SpiderIQClient('YOUR_API_KEY');
const companies = [
'https://company1.com',
'https://company2.com',
'https://company3.com'
];
for (const url of companies) {
try {
const results = await client.extractContacts(url, 15);
const { data } = results;
console.log(`\n✓ ${url}`);
console.log(` Emails: ${data.emails}`);
console.log(` Phones: ${data.phones}`);
console.log(` LinkedIn: ${data.linkedin || 'Not found'}`);
console.log(` Pages crawled: ${data.pages_crawled}`);
} catch (error) {
console.log(`\n✗ ${url}: ${error.message}`);
}
}
Email Filtering
SpiderIQ automatically filters out tracking and garbage emails:
Filtered domains: sentry.io, wixpress.com, mailchimp.com, hubspot.com, google-analytics.com, and 20+ more tracking services
Example:
// Raw emails found:
[
"contact@example.com", // ā Real contact
"noreply@sentry.io", // ā Filtered (tracking)
"info@example.com", // ā Real contact
"auto@wixpress.com" // ā Filtered (tracking)
]
// Returned in response:
["contact@example.com", "info@example.com"]
Compare metadata.total_emails_found with the length of the returned emails array to see the filtering impact.
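For example, a quick way to log that impact per crawl (a sketch; results is the parsed response from the polling helper, and the field names match the sample response above):
data = results["data"]
found = data["metadata"]["total_emails_found"]
kept = len(data["emails"])
print(f"Emails: {found} found, {kept} kept, {found - kept} filtered out")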
Deduplication (24-Hour Cache)
SpiderIQ automatically deduplicates crawls within 24 hours:
First submission:
{
"job_id": "abc-123",
"from_cache": false,
"message": "SpiderSite job queued successfully"
}
Second submission (same URL within 24 hours):
{
"job_id": "def-456",
"from_cache": true,
"status": "completed", // ā Instant response!
"message": "Job results retrieved from cache (original job: abc-123)"
}
Save time & money: If you accidentally submit the same URL twice, the second request returns instantly with cached results.
Processing Time
| Scenario | Estimated Time |
|---|---|
| Small site (5-10 pages) | 5-15 seconds |
| Medium site (10-20 pages) | 15-30 seconds |
| Large site (20-50 pages) | 30-60 seconds |
| SPA site (JavaScript-heavy) | +10-20 seconds |
Best Practices
Optimal max_pages setting
Start with 10-15 pages for most B2B websites. This typically covers:
- Homepage
- Contact page
- About page
- Team page
- Key product/service pages
Increase to 20-30 for:
- Large enterprises
- Multi-location businesses
- Companies with extensive blogs
Increase to 40-50 for:
- Complete site mapping
- Comprehensive competitor analysis
Target pages for contact extraction
Always include:
- contact - Primary contact info
- about - Company overview + social links
- team - People and roles
Consider adding:
- locations - Multi-office businesses
- leadership - Executive team info
- careers - Team size indicators
- press/media - Press contact info
When to disable compendium
Disable compendium when:
- You ONLY need contact info (not content)
- Processing 1000+ URLs (save bandwidth)
- Integrating with CRM (structured data only)
Keep compendium when:
- Feeding content to LLMs
- Doing market research
- Analyzing company positioning
- Building knowledge bases
Handling failures
Common failure reasons:
- Connection timeout - Site is slow or blocking
- Robots.txt restriction - Site blocks crawlers
- CAPTCHA protection - Site requires human verification
- Invalid URL - URL is malformed or unreachable
Mitigation:
- Increase timeout to 60-90 seconds for slow sites
- Increase spa_timeout to 60 seconds for heavy SPAs
- Check the error_message field for the specific failure reason
- Verify the URL is publicly accessible (not behind a login)
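One way to apply the first two mitigations automatically is to retry a failed job once with longer timeouts. A hedged sketch using the SpiderIQClient from the complete example above; timeout and spa_timeout are the payload fields referenced in the list, and the retry policy itself is just an illustration:
def extract_with_retry(client, url, max_pages=15):
    """Try a normal crawl; on failure, retry once with longer timeouts."""
    try:
        return client.extract_contacts(url, max_pages=max_pages)
    except Exception as first_error:
        print(f"First attempt failed ({first_error}); retrying with longer timeouts...")
        response = requests.post(
            f"{client.base_url}/jobs/spiderSite/submit",
            headers=client.headers,
            json={"payload": {"url": url, "max_pages": max_pages,
                              "timeout": 90, "spa_timeout": 60}},
        )
        response.raise_for_status()
        return client.get_results(response.json()["job_id"], max_wait=180)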
Common Use Cases
1. CRM Enrichment
Enrich existing lead database with contact info:
import pandas as pd
# Load leads from CSV
leads = pd.read_csv('leads.csv') # columns: company_name, website
client = SpiderIQClient("YOUR_API_KEY")
for idx, row in leads.iterrows():
try:
results = client.extract_contacts(row['website'], max_pages=15)
data = results['data']
# Update DataFrame
leads.at[idx, 'emails'] = ', '.join(data['emails'])
leads.at[idx, 'phones'] = ', '.join(data['phones'])
leads.at[idx, 'linkedin'] = data.get('linkedin')
except Exception as e:
print(f"Failed {row['website']}: {e}")
# Save enriched data
leads.to_csv('leads_enriched.csv', index=False)
2. Prospecting Workflow
Find contact info for a list of target companies:
target_companies = [
"https://techcorp1.com",
"https://saas-company2.com",
"https://enterprise3.com"
]
contacts_db = []
for url in target_companies:
results = client.extract_contacts(url, max_pages=20)
data = results['data']
# Structure for export
contacts_db.append({
'company_url': url,
'emails': data['emails'],
'phones': data['phones'],
'linkedin': data.get('linkedin'),
'twitter': data.get('twitter'),
'pages_crawled': data['pages_crawled']
})
# Export to CSV/JSON
pd.DataFrame(contacts_db).to_csv('prospects.csv')
3. Competitor Monitoring
Track competitor contact changes over time:
import json
from datetime import datetime
competitor = "https://competitor.com"
results = client.extract_contacts(competitor, max_pages=30)
# Save snapshot with timestamp
snapshot = {
'timestamp': datetime.now().isoformat(),
'url': competitor,
'data': results['data']
}
with open(f'competitor_snapshot_{datetime.now().strftime("%Y%m%d")}.json', 'w') as f:
json.dump(snapshot, f, indent=2)
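To actually detect changes, compare today's snapshot against the previous one. A minimal sketch; the previous-snapshot filename is illustrative, and the comparison only covers emails and phones:
def diff_contacts(old_snapshot, new_snapshot):
    """Report emails and phones that appeared or disappeared between two snapshots."""
    changes = {}
    for field in ("emails", "phones"):
        old = set(old_snapshot["data"].get(field) or [])
        new = set(new_snapshot["data"].get(field) or [])
        changes[field] = {"added": sorted(new - old), "removed": sorted(old - new)}
    return changes

# Usage: load a previously saved snapshot and compare (filename is hypothetical)
with open("competitor_snapshot_20251026.json") as f:
    previous = json.load(f)
print(diff_contacts(previous, snapshot))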
Rate Limits
100 requests per minute per API key
For bulk processing, add rate limiting:
import time
def rate_limited_extract(client, urls, requests_per_minute=90):
"""Extract contacts with rate limiting"""
delay = 60.0 / requests_per_minute
for url in urls:
start = time.time()
try:
results = client.extract_contacts(url)
yield url, results
except Exception as e:
yield url, {'error': str(e)}
# Rate limiting delay
elapsed = time.time() - start
if elapsed < delay:
time.sleep(delay - elapsed)
# Usage
for url, results in rate_limited_extract(client, companies):
print(f"Processed: {url}")