Facebook Business Page Scraping
Overview
SpiderFacebookPage extracts business information from Facebook pages. Submit a Facebook page URL and receive structured data including contact details, follower counts, ratings, and profile pictures hosted on SpiderMedia.
Single URL Per Job: Unlike SpiderMaps (which returns 100+ businesses per search), SpiderFacebookPage processes one Facebook page at a time. This is ideal for enriching existing leads with Facebook data.
What You Can Extract
Each Facebook page can provide:
Contact Information
- Business email
- Phone number
- Physical address
- Website URL
Social Proof
- Follower count
- Page likes
- Rating & reviews
- Business category
Business Details
- Operating hours
- Price range
- Services offered
- Business type
Media
- Profile picture (hosted on SpiderMedia)
- Linked social accounts
- Facebook page ID
Quick Start
Submit a Job
- Python
- cURL
- JavaScript
import requests
import time
API = "https://spideriq.ai/api/v1"
TOKEN = "<your_token>"
headers = {"Authorization": f"Bearer {TOKEN}"}
# Submit job
response = requests.post(
    f"{API}/jobs/spiderFacebookPage/submit",
    headers=headers,
    json={
        "payload": {
            "url": "https://www.facebook.com/CocaCola"
        }
    }
)
job = response.json()
print(f"Job submitted: {job['job_id']}")
curl -X POST https://spideriq.ai/api/v1/jobs/spiderFacebookPage/submit \
  -H "Authorization: Bearer <your_token>" \
  -H "Content-Type: application/json" \
  -d '{
    "payload": {
      "url": "https://www.facebook.com/CocaCola"
    }
  }'
const response = await fetch(
  'https://spideriq.ai/api/v1/jobs/spiderFacebookPage/submit',
  {
    method: 'POST',
    headers: {
      'Authorization': 'Bearer <your_token>',
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      payload: {
        url: 'https://www.facebook.com/CocaCola'
      }
    })
  }
);

const job = await response.json();
console.log('Job submitted:', job.job_id);
Retrieve Results
- Python
- JavaScript
# Poll for results
while True:
    response = requests.get(
        f"{API}/jobs/{job['job_id']}/results",
        headers=headers
    )

    if response.status_code == 200:
        result = response.json()
        data = result['data']['data']
        print(f"Name: {data['name']}")
        print(f"Followers: {data.get('followers', 'N/A')}")
        print(f"Phone: {data.get('phone', 'N/A')}")
        print(f"Category: {data.get('category', 'N/A')}")
        print(f"Profile Picture: {data.get('profile_picture_stored', 'N/A')}")
        break
    elif response.status_code == 202:
        print("Processing...")
        time.sleep(5)
    else:
        print(f"Error: {response.status_code}")
        break
const pollResults = async (jobId) => {
  while (true) {
    const response = await fetch(
      `https://spideriq.ai/api/v1/jobs/${jobId}/results`,
      { headers: { 'Authorization': 'Bearer <your_token>' } }
    );

    if (response.status === 200) {
      const result = await response.json();
      const data = result.data.data;
      console.log('Name:', data.name);
      console.log('Followers:', data.followers || 'N/A');
      console.log('Phone:', data.phone || 'N/A');
      console.log('Profile Picture:', data.profile_picture_stored || 'N/A');
      break;
    } else if (response.status === 202) {
      console.log('Processing...');
      await new Promise(r => setTimeout(r, 5000));
    } else {
      console.error('Error:', response.status);
      break;
    }
  }
};
Results Structure
{
  "success": true,
  "job_id": "ef89341d-1a55-401f-8eb0-ebce5b565f0a",
  "type": "spiderFacebookPage",
  "status": "completed",
  "processing_time_seconds": 18.5,
  "data": {
    "data": {
      "name": "Coca-Cola",
      "facebook_id": "40796308305",
      "facebook_url": "https://www.facebook.com/CocaCola",
      "profile_picture_url": "https://scontent.xx.fbcdn.net/...",
      "profile_picture_stored": "https://media.spideriq.ai/client-cli_xxx/facebook-profiles/40796308305.png",
      "email": null,
      "phone": null,
      "address": null,
      "website": "https://coca-cola.com",
      "category": "Page · Food & Beverage Company",
      "followers": "107M followers",
      "likes": null,
      "rating": null,
      "business_hours": null,
      "price_range": null,
      "services": null,
      "social_accounts": null,
      "is_business_page": true
    },
    "metadata": {
      "url": "https://www.facebook.com/CocaCola",
      "proxy_used": true,
      "profile_picture_stored": true
    }
  }
}
Common Use Cases
1. Enriching Google Maps Leads
Chain SpiderMaps with SpiderFacebookPage to get both Google Maps data and Facebook presence:
import requests

API = "https://spideriq.ai/api/v1"
headers = {"Authorization": "Bearer <your_token>"}

# Step 1: Get businesses from Google Maps
maps_job = requests.post(
    f"{API}/jobs/spiderMaps/submit",
    headers=headers,
    json={
        "payload": {
            "search_query": "restaurants Berlin, Germany",
            "max_results": 20
        }
    }
).json()

# Wait for results (get_results is a polling helper, sketched below)
maps_results = get_results(maps_job['job_id'])

# Step 2: Enrich with Facebook data where available
for business in maps_results['data']['businesses']:
    # Many businesses have Facebook in their website links
    website = business.get('website', '')
    if 'facebook.com' in website:
        fb_job = requests.post(
            f"{API}/jobs/spiderFacebookPage/submit",
            headers=headers,
            json={"payload": {"url": website}}
        ).json()
        fb_data = get_results(fb_job['job_id'])
        business['facebook_data'] = fb_data['data']['data']
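The get_results helper used above (and in later examples) is not defined on this page. A minimal sketch, following the poll-until-200 pattern from the Quick Start, might look like this; treat it as one way to wire up polling, not part of the API:

import time
import requests

# Assumes API and headers are defined as in the Quick Start

def get_results(job_id, poll_interval=5):
    """Poll a job until it completes and return the parsed JSON body (illustrative helper)."""
    while True:
        response = requests.get(f"{API}/jobs/{job_id}/results", headers=headers)
        if response.status_code == 200:
            return response.json()
        if response.status_code != 202:
            response.raise_for_status()  # surface unexpected errors instead of looping forever
        time.sleep(poll_interval)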
2. Bulk Facebook Page Extraction
Process multiple Facebook URLs efficiently:
import requests
import time
from concurrent.futures import ThreadPoolExecutor

# Assumes API and headers are defined as in the previous example

facebook_urls = [
    "https://www.facebook.com/McDonalds",
    "https://www.facebook.com/Starbucks",
    "https://www.facebook.com/Nike",
    "https://www.facebook.com/CocaCola",
]

def scrape_facebook_page(url):
    """Submit job and wait for results."""
    job = requests.post(
        f"{API}/jobs/spiderFacebookPage/submit",
        headers=headers,
        json={"payload": {"url": url}}
    ).json()

    # Poll for results
    for _ in range(30):  # Max 2.5 minutes
        response = requests.get(
            f"{API}/jobs/{job['job_id']}/results",
            headers=headers
        )
        if response.status_code == 200:
            return response.json()['data']['data']
        time.sleep(5)
    return None

# Process in parallel (max 5 concurrent)
with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(executor.map(scrape_facebook_page, facebook_urls))

# Print summary
for url, data in zip(facebook_urls, results):
    if data:
        print(f"{data['name']}: {data.get('followers', 'N/A')} followers")
    else:
        print(f"{url}: Failed")
3. Competitor Analysis
Compare Facebook metrics across competitors:
competitors = [
    "https://www.facebook.com/McDonalds",
    "https://www.facebook.com/BurgerKing",
    "https://www.facebook.com/Wendys",
    "https://www.facebook.com/Subway",
]

def parse_followers(followers_str):
    """Convert '82M followers' to a number."""
    if not followers_str:
        return 0
    followers_str = followers_str.replace(' followers', '').strip()
    if 'M' in followers_str:
        return float(followers_str.replace('M', '')) * 1_000_000
    elif 'K' in followers_str:
        return float(followers_str.replace('K', '')) * 1_000
    return float(followers_str)

# Collect data
competitor_data = []
for url in competitors:
    data = scrape_facebook_page(url)
    if data:
        competitor_data.append({
            'name': data['name'],
            'followers': parse_followers(data.get('followers')),
            'followers_display': data.get('followers') or 'N/A',
            'rating': data.get('rating') or 'N/A',       # field may be null
            'category': data.get('category') or 'N/A',
            'has_phone': bool(data.get('phone')),
            'has_website': bool(data.get('website')),
        })

# Sort by followers
competitor_data.sort(key=lambda x: x['followers'], reverse=True)

# Print report
print("Competitor Facebook Analysis")
print("=" * 50)
for comp in competitor_data:
    print(f"\n{comp['name']}")
    print(f"  Followers: {comp['followers_display']}")
    print(f"  Rating: {comp['rating']}")
    print(f"  Category: {comp['category']}")
4. Profile Picture Collection
Collect and host profile pictures for a directory:
facebook_pages = [
    "https://www.facebook.com/Nike",
    "https://www.facebook.com/Adidas",
    "https://www.facebook.com/Puma",
]

profile_pictures = {}
for url in facebook_pages:
    data = scrape_facebook_page(url)
    if data and data.get('profile_picture_stored'):
        profile_pictures[data['name']] = {
            'hosted_url': data['profile_picture_stored'],     # Permanent URL
            'original_url': data.get('profile_picture_url'),  # May expire
            'facebook_id': data.get('facebook_id'),
        }

# Use hosted URLs in your application
for name, pics in profile_pictures.items():
    print(f"{name}: {pics['hosted_url']}")
SpiderMedia Integration
Profile pictures are automatically uploaded to your client's SpiderMedia bucket, providing:
- Permanent URLs: Won't expire like Facebook CDN URLs
- Consistent format: https://media.spideriq.ai/client-{client_id}/facebook-profiles/{facebook_id}.{ext}
- No external dependencies: Images hosted on SpiderIQ infrastructure
Image Storage: Always use profile_picture_stored for long-term storage. The profile_picture_url points to Facebook's CDN and may expire or change.
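If you also want local copies of the hosted images (for example, to bundle them into an export), a small download helper is enough. This is an illustrative sketch; the download_profile_picture name and the images/ output directory are arbitrary choices, not part of the API:

import os
import requests

def download_profile_picture(data, output_dir="images"):
    """Download the SpiderMedia-hosted profile picture to disk (illustrative helper)."""
    hosted_url = data.get('profile_picture_stored')
    if not hosted_url:
        return None  # no picture was stored for this page
    os.makedirs(output_dir, exist_ok=True)
    # Name the file after the Facebook page ID so repeated runs overwrite cleanly
    ext = hosted_url.rsplit('.', 1)[-1]
    path = os.path.join(output_dir, f"{data.get('facebook_id', 'unknown')}.{ext}")
    response = requests.get(hosted_url, timeout=30)
    response.raise_for_status()
    with open(path, 'wb') as f:
        f.write(response.content)
    return path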
URL Formats Supported
SpiderFacebookPage accepts various Facebook URL formats:
| Format | Example |
|---|---|
| Page username | https://www.facebook.com/McDonalds |
| With www | https://www.facebook.com/CocaCola |
| Without www | https://facebook.com/Nike |
| Profile ID | https://www.facebook.com/profile.php?id=123456 |
| Mobile | https://m.facebook.com/pagename |
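Before submitting a large batch, a quick sanity check against these formats saves wasted API calls (see also "Validate URLs First" under Best Practices). A rough, illustrative check, not an official validator:

from urllib.parse import urlparse

def looks_like_facebook_page(url):
    """Rough check that a URL matches one of the supported formats above (illustrative)."""
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        return False
    if parsed.netloc.lower() not in ("www.facebook.com", "facebook.com", "m.facebook.com"):
        return False
    path = parsed.path.strip("/")
    if path == "profile.php":
        return "id=" in parsed.query  # profile ID format
    return bool(path)  # username format, e.g. /McDonalds

print(looks_like_facebook_page("https://facebook.com/Nike"))                       # True
print(looks_like_facebook_page("https://www.facebook.com/profile.php?id=123456"))  # True
print(looks_like_facebook_page("https://twitter.com/Nike"))                        # False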
Handling Failures
Private/Restricted Pages
Some pages cannot be scraped due to privacy settings:
result = requests.get(f"{API}/jobs/{job_id}/results", headers=headers)
data = result.json()

if data.get('status') == 'failed':
    error = data.get('error', '')
    if 'Library exited with code 1' in error:
        print("Page is private or restricted")
    else:
        print(f"Failed: {error}")
Common Failure Causes:
- Personal profiles (not business pages)
- Pages with restricted visibility
- Newly created pages
- Region-restricted content
- Temporarily unavailable pages
Retry Strategy
def scrape_with_retry(url, max_retries=3):
    """Scrape with retry on failure."""
    for attempt in range(max_retries):
        result = scrape_facebook_page(url)
        if result:
            return result
        print(f"Attempt {attempt + 1} failed, retrying...")
        time.sleep(10)  # Wait before retry
    return None
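Example usage (assumes scrape_facebook_page from the bulk extraction example is in scope):

data = scrape_with_retry("https://www.facebook.com/CocaCola")
if data:
    print(f"{data['name']}: {data.get('followers', 'N/A')}")
else:
    print("Page could not be scraped after retries")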
Rate Limiting
Recommended Rate: Submit no more than 10 Facebook page jobs per minute to avoid potential blocking.
import time

urls = [...]  # Your list of Facebook URLs

for i, url in enumerate(urls):
    job = requests.post(
        f"{API}/jobs/spiderFacebookPage/submit",
        headers=headers,
        json={"payload": {"url": url}}
    )

    # Rate limit: max 10 per minute
    if (i + 1) % 10 == 0:
        print("Rate limit pause...")
        time.sleep(60)
Processing Time
| Scenario | Typical Time |
|---|---|
| Standard public page | 10-20 seconds |
| Large page (millions of followers) | 15-25 seconds |
| With SpiderMedia upload | +5-10 seconds |
| Proxy fallback needed | +15-20 seconds |
Best Practices
Use Business Pages: SpiderFacebookPage works best with Facebook Business Pages. Personal profiles often have privacy restrictions that prevent scraping.
Validate URLs First: Before submitting many jobs, verify your URLs are valid Facebook page URLs to avoid wasting API calls.
Cache Results: Store Facebook page data with the facebook_id as a unique key to avoid re-scraping the same pages.
Handle Missing Data: Not all pages have all fields. Always check if a field exists before using it.
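As a minimal sketch of the caching and missing-data advice above (the JSON file and helper names are illustrative, and scrape_facebook_page is the helper from the bulk extraction example):

import json
import os

CACHE_PATH = "facebook_cache.json"  # illustrative location; swap in a real database as needed

def load_cache():
    if os.path.exists(CACHE_PATH):
        with open(CACHE_PATH) as f:
            return json.load(f)
    return {}

def save_cache(cache):
    with open(CACHE_PATH, "w") as f:
        json.dump(cache, f)

cache = load_cache()

def get_page(url):
    """Return cached data when available; otherwise scrape and store it keyed by facebook_id."""
    # facebook_id is only known after a scrape, so look up cached entries by their canonical URL
    for page in cache.values():
        if page.get("facebook_url") == url:
            return page
    data = scrape_facebook_page(url)
    if data and data.get("facebook_id"):
        cache[data["facebook_id"]] = data  # facebook_id as the unique key
        save_cache(cache)
    return data

page = get_page("https://www.facebook.com/Nike")
if page:
    # Safe access: fields like phone, rating, or business_hours may be null
    print(page.get("phone") or "no phone listed")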
Combining with Other Workers
SpiderMaps → SpiderFacebookPage
# Find businesses on Google Maps, then enrich with Facebook
# (submit_maps_job is an illustrative wrapper that submits a SpiderMaps job and
#  returns its job_id; get_results is the polling helper sketched earlier)
maps_job = submit_maps_job("coffee shops Berlin")
businesses = get_results(maps_job)

for biz in businesses['data']['businesses']:
    # Look for a Facebook link in the website field
    website = biz.get('website', '')
    if 'facebook.com' in website.lower():
        fb_data = scrape_facebook_page(website)
        if fb_data:
            biz['facebook_followers'] = fb_data.get('followers')
            biz['facebook_rating'] = fb_data.get('rating')
SpiderSite → SpiderFacebookPage
# Scrape a website, find its Facebook link, then get the Facebook data
# (submit_site_job is an illustrative wrapper returning the SpiderSite job_id)
site_job = submit_site_job("https://example.com")
site_data = get_results(site_job)

# Extract the Facebook link from the detected social profiles
facebook_url = site_data['data'].get('facebook')
if facebook_url:
    fb_data = scrape_facebook_page(facebook_url)