Scrape and Analyze Airbnb Data with Firecrawl and HopX
Web scraping is messy. JavaScript-rendered pages, anti-bot measures, rate limits, and unpredictable HTML structures make it a constant battle.
Firecrawl solves the extraction problem—it handles JavaScript rendering, converts pages to clean markdown or structured data, and manages rate limits. But once you have the data, you need somewhere safe to process it.
This tutorial shows how to build a complete pipeline: scrape Airbnb listing data with Firecrawl, then analyze it in isolated HopX sandboxes.
Why This Architecture?
```
┌────────────────────────────────────────────────────────────────┐
│                        Your Application                        │
└────────────────────────────────────────────────────────────────┘
                                │
             ┌──────────────────┴──────────────────┐
             │                                     │
             ▼                                     ▼
 ┌───────────────────────┐           ┌───────────────────────────┐
 │       Firecrawl       │           │       HopX Sandbox        │
 │                       │           │                           │
 │ • JS rendering        │           │ • Secure code execution   │
 │ • Anti-bot bypass     │           │ • Data processing         │
 │ • Clean extraction    │           │ • Feature engineering     │
 │ • Rate limiting       │           │ • Analysis & ML           │
 └───────────────────────┘           └───────────────────────────┘
             │                                     │
             └──────────────────┬──────────────────┘
                                │
                                ▼
                    Processed, analyzed data
```
Why not process data locally?
- Scraped data can contain malicious payloads
- Dynamic code (eval-based transformations) needs isolation
- Reproducible environments for consistent results
- Scale processing without affecting your main system
Prerequisites
```bash
pip install firecrawl-py hopx-ai pandas
```
Set your API keys:
```bash
export FIRECRAWL_API_KEY="fc-..."
export HOPX_API_KEY="..."
```
Step 1: Scrape Airbnb Listings with Firecrawl
First, let's extract listing data. Firecrawl handles the JavaScript rendering and returns clean, structured content:
```python
from firecrawl import FirecrawlApp
import json
import os

# Initialize Firecrawl with the API key exported in Prerequisites
firecrawl = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])

def scrape_airbnb_search(location: str, checkin: str, checkout: str) -> dict:
    """Scrape Airbnb search results for a location."""

    # Build search URL
    url = f"https://www.airbnb.com/s/{location}/homes"
    params = f"?checkin={checkin}&checkout={checkout}"

    # Scrape with Firecrawl
    result = firecrawl.scrape_url(
        url + params,
        params={
            "formats": ["markdown", "extract"],
            "extract": {
                "schema": {
                    "type": "object",
                    "properties": {
                        "listings": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "title": {"type": "string"},
                                    "price_per_night": {"type": "string"},
                                    "rating": {"type": "string"},
                                    "reviews_count": {"type": "string"},
                                    "property_type": {"type": "string"},
                                    "bedrooms": {"type": "string"},
                                    "amenities": {"type": "array", "items": {"type": "string"}}
                                }
                            }
                        }
                    }
                }
            }
        }
    )

    return result


# Scrape multiple locations
locations = ["new-york", "san-francisco", "miami"]
all_listings = []

for location in locations:
    print(f"Scraping {location}...")
    data = scrape_airbnb_search(location, "2025-02-01", "2025-02-07")

    if data.get("extract", {}).get("listings"):
        for listing in data["extract"]["listings"]:
            listing["location"] = location
            all_listings.append(listing)

print(f"Scraped {len(all_listings)} listings")
```
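Before handing the data off for processing, it's worth persisting the raw scrape to disk so the later steps can be re-run without spending more Firecrawl credits. A minimal sketch, continuing the script above (the file name is arbitrary):

```python
# Optional: save the raw scrape so processing can be replayed without re-scraping
with open("raw_listings.json", "w") as f:
    json.dump(all_listings, f, indent=2)
print(f"Saved {len(all_listings)} raw listings to raw_listings.json")
```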
Step 2: Process Data in HopX Sandbox
Now let's clean and transform the scraped data in an isolated environment:
```python
from hopx import Sandbox
import json

def process_listings_in_sandbox(raw_listings: list) -> dict:
    """Clean and process listing data in an isolated sandbox."""

    sandbox = Sandbox.create(template="code-interpreter")

    try:
        # Upload raw data to the sandbox
        sandbox.files.write("/app/raw_listings.json", json.dumps(raw_listings))

        # Data cleaning and feature engineering code
        processing_code = '''
import pandas as pd
import json
import re

# Load raw data
with open("/app/raw_listings.json") as f:
    raw_data = json.load(f)

df = pd.DataFrame(raw_data)

# Clean price column
def clean_price(price_str):
    if not price_str:
        return None
    # Extract numeric value
    match = re.search(r"[\d,]+", str(price_str).replace(",", ""))
    return float(match.group()) if match else None

df["price_clean"] = df["price_per_night"].apply(clean_price)

# Clean rating
def clean_rating(rating_str):
    if not rating_str:
        return None
    match = re.search(r"(\d+\.?\d*)", str(rating_str))
    return float(match.group()) if match else None

df["rating_clean"] = df["rating"].apply(clean_rating)

# Clean reviews count
def clean_reviews(reviews_str):
    if not reviews_str:
        return 0
    match = re.search(r"(\d+)", str(reviews_str).replace(",", ""))
    return int(match.group()) if match else 0

df["reviews_clean"] = df["reviews_count"].apply(clean_reviews)

# Extract bedrooms as integer
def extract_bedrooms(bedroom_str):
    if not bedroom_str:
        return None
    match = re.search(r"(\d+)", str(bedroom_str))
    return int(match.group()) if match else None

df["bedrooms_clean"] = df["bedrooms"].apply(extract_bedrooms)

# Feature engineering
df["price_per_bedroom"] = df.apply(
    lambda x: x["price_clean"] / x["bedrooms_clean"]
    if x["bedrooms_clean"] and x["bedrooms_clean"] > 0
    else None,
    axis=1
)

df["is_highly_rated"] = df["rating_clean"] >= 4.8
df["is_popular"] = df["reviews_clean"] >= 50

# Calculate value score (lower price + higher rating = better value)
df["value_score"] = df.apply(
    lambda x: (x["rating_clean"] or 0) / (x["price_clean"] or 1) * 100
    if x["price_clean"] and x["price_clean"] > 0
    else 0,
    axis=1
)

# Summary statistics
summary = {
    "total_listings": len(df),
    "avg_price": df["price_clean"].mean(),
    "avg_rating": df["rating_clean"].mean(),
    "price_by_location": df.groupby("location")["price_clean"].mean().to_dict(),
    "rating_by_location": df.groupby("location")["rating_clean"].mean().to_dict(),
    "top_value_listings": df.nlargest(5, "value_score")[
        ["title", "location", "price_clean", "rating_clean", "value_score"]
    ].to_dict("records")
}

# Save processed data
df.to_csv("/app/processed_listings.csv", index=False)

# Output summary
print(json.dumps(summary, indent=2, default=str))
'''

        # Execute processing
        result = sandbox.runCode(processing_code, language="python", timeout=60)

        if result.exitCode != 0:
            raise Exception(f"Processing failed: {result.stderr}")

        # Parse summary from stdout
        summary = json.loads(result.stdout)

        # Download processed CSV
        processed_csv = sandbox.files.read("/app/processed_listings.csv")

        return {
            "summary": summary,
            "processed_data": processed_csv
        }

    finally:
        sandbox.kill()


# Process the scraped data
results = process_listings_in_sandbox(all_listings)
print(json.dumps(results["summary"], indent=2))
```
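If you want to spot-check the output before moving on, you can load the returned CSV locally. This is a quick sanity-check sketch; it assumes `files.read` returns the CSV as text, as the example above implies:

```python
import io
import pandas as pd

# Load the processed CSV returned from the sandbox and eyeball the cleaned columns
df_check = pd.read_csv(io.StringIO(results["processed_data"]))
print(df_check[["title", "location", "price_clean", "rating_clean", "value_score"]].head())
print(df_check["price_clean"].describe())
```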
Step 3: Advanced Analysis with Visualization
Generate insights and visualizations:
```python
from hopx import Sandbox
import json
import base64

def analyze_and_visualize(processed_csv: str) -> dict:
    """Run advanced analysis and create visualizations."""

    sandbox = Sandbox.create(template="code-interpreter")

    try:
        # Upload processed data
        sandbox.files.write("/app/listings.csv", processed_csv)

        analysis_code = '''
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json

# Set style
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_palette("husl")

# Load data
df = pd.read_csv("/app/listings.csv")

# Create figure with subplots
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Price distribution by location
ax1 = axes[0, 0]
df.boxplot(column="price_clean", by="location", ax=ax1)
ax1.set_title("Price Distribution by Location")
ax1.set_xlabel("Location")
ax1.set_ylabel("Price per Night ($)")
plt.suptitle("")

# 2. Rating vs Price scatter
ax2 = axes[0, 1]
for location in df["location"].unique():
    loc_data = df[df["location"] == location]
    ax2.scatter(
        loc_data["price_clean"],
        loc_data["rating_clean"],
        label=location,
        alpha=0.6
    )
ax2.set_xlabel("Price per Night ($)")
ax2.set_ylabel("Rating")
ax2.set_title("Price vs Rating by Location")
ax2.legend()

# 3. Average price by bedrooms
ax3 = axes[1, 0]
bedroom_prices = df.groupby("bedrooms_clean")["price_clean"].mean().dropna()
bedroom_prices.plot(kind="bar", ax=ax3, color="steelblue")
ax3.set_title("Average Price by Bedroom Count")
ax3.set_xlabel("Bedrooms")
ax3.set_ylabel("Average Price ($)")
ax3.tick_params(axis="x", rotation=0)

# 4. Value score distribution
ax4 = axes[1, 1]
df["value_score"].hist(bins=20, ax=ax4, color="coral", edgecolor="black")
ax4.set_title("Value Score Distribution")
ax4.set_xlabel("Value Score")
ax4.set_ylabel("Count")
ax4.axvline(df["value_score"].median(), color="red", linestyle="--", label="Median")
ax4.legend()

plt.tight_layout()
plt.savefig("/app/analysis.png", dpi=150, bbox_inches="tight")
plt.close()

# Statistical analysis
analysis = {
    "correlation_price_rating": df["price_clean"].corr(df["rating_clean"]),
    "price_stats": {
        "mean": df["price_clean"].mean(),
        "median": df["price_clean"].median(),
        "std": df["price_clean"].std(),
        "min": df["price_clean"].min(),
        "max": df["price_clean"].max()
    },
    "rating_stats": {
        "mean": df["rating_clean"].mean(),
        "median": df["rating_clean"].median(),
        "std": df["rating_clean"].std()
    },
    "best_value_by_location": df.loc[
        df.groupby("location")["value_score"].idxmax()
    ][["location", "title", "price_clean", "rating_clean", "value_score"]].to_dict("records"),
    "listings_above_4_5_rating": len(df[df["rating_clean"] >= 4.5]),
    "listings_under_100": len(df[df["price_clean"] < 100])
}

print(json.dumps(analysis, indent=2, default=str))
'''

        result = sandbox.runCode(analysis_code, language="python", timeout=120)

        if result.exitCode != 0:
            raise Exception(f"Analysis failed: {result.stderr}")

        analysis = json.loads(result.stdout)

        # Download visualization
        chart_bytes = sandbox.files.read("/app/analysis.png")
        chart_b64 = base64.b64encode(chart_bytes).decode()

        return {
            "analysis": analysis,
            "chart_base64": chart_b64
        }

    finally:
        sandbox.kill()


# Run analysis
viz_results = analyze_and_visualize(results["processed_data"])
print(json.dumps(viz_results["analysis"], indent=2))

# Save chart locally
with open("airbnb_analysis.png", "wb") as f:
    f.write(base64.b64decode(viz_results["chart_base64"]))
print("Chart saved to airbnb_analysis.png")
```
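Because the chart comes back base64-encoded, it's easy to fold the whole result into a self-contained HTML report. A small optional sketch (the file name is arbitrary, and it reuses `viz_results` from above):

```python
# Build a shareable HTML report that embeds the chart inline as a data URI
html = f"""<html><body>
<h1>Airbnb Listing Analysis</h1>
<pre>{json.dumps(viz_results["analysis"], indent=2)}</pre>
<img src="data:image/png;base64,{viz_results['chart_base64']}" alt="Analysis charts">
</body></html>"""

with open("airbnb_report.html", "w") as f:
    f.write(html)
print("Report saved to airbnb_report.html")
```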
Step 4: Feature Engineering Pipeline
For machine learning tasks, create a reusable feature engineering pipeline:
```python
from hopx import Sandbox
import json

def create_ml_features(listings_csv: str) -> str:
    """Create ML-ready features from listing data."""

    sandbox = Sandbox.create(template="code-interpreter")

    try:
        sandbox.files.write("/app/listings.csv", listings_csv)

        feature_code = '''
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import json

df = pd.read_csv("/app/listings.csv")

# ============================================
# Feature Engineering
# ============================================

# 1. Location encoding
le_location = LabelEncoder()
df["location_encoded"] = le_location.fit_transform(df["location"])

# 2. Property type encoding (if it exists)
if "property_type" in df.columns:
    le_property = LabelEncoder()
    df["property_type_encoded"] = le_property.fit_transform(
        df["property_type"].fillna("Unknown")
    )

# 3. Amenities features (count and specific flags)
if "amenities" in df.columns:
    # Parse amenities if stored as a string
    def parse_amenities(x):
        if pd.isna(x):
            return []
        if isinstance(x, str):
            try:
                return json.loads(x.replace("'", '"'))
            except Exception:
                return x.split(",")
        return x

    df["amenities_parsed"] = df["amenities"].apply(parse_amenities)
    df["amenity_count"] = df["amenities_parsed"].apply(len)

    # Flag specific high-value amenities
    high_value_amenities = ["wifi", "pool", "parking", "kitchen", "washer", "ac", "air conditioning"]
    for amenity in high_value_amenities:
        df[f"has_{amenity.replace(' ', '_')}"] = df["amenities_parsed"].apply(
            lambda x: any(amenity.lower() in str(a).lower() for a in x) if x else False
        ).astype(int)

# 4. Price features
df["price_log"] = np.log1p(df["price_clean"])
df["price_squared"] = df["price_clean"] ** 2

# Price relative to location average
location_avg_price = df.groupby("location")["price_clean"].transform("mean")
df["price_vs_location_avg"] = df["price_clean"] / location_avg_price

# 5. Rating features
df["rating_missing"] = df["rating_clean"].isna().astype(int)
df["rating_filled"] = df["rating_clean"].fillna(df["rating_clean"].median())

# High performer flags
df["is_superhost_quality"] = (
    (df["rating_clean"] >= 4.8) &
    (df["reviews_clean"] >= 10)
).astype(int)

# 6. Review features
df["reviews_log"] = np.log1p(df["reviews_clean"])
df["has_reviews"] = (df["reviews_clean"] > 0).astype(int)

# Reviews relative to location
location_avg_reviews = df.groupby("location")["reviews_clean"].transform("mean")
df["reviews_vs_location_avg"] = df["reviews_clean"] / (location_avg_reviews + 1)

# 7. Bedroom features
df["bedrooms_filled"] = df["bedrooms_clean"].fillna(1)
df["is_studio"] = (df["bedrooms_clean"] == 0).astype(int)
df["is_large"] = (df["bedrooms_clean"] >= 3).astype(int)

# 8. Interaction features
df["price_rating_interaction"] = df["price_clean"] * df["rating_filled"]
df["bedrooms_price_interaction"] = df["bedrooms_filled"] * df["price_clean"]

# 9. Normalized features
scaler = StandardScaler()
numeric_cols = ["price_clean", "rating_filled", "reviews_clean", "bedrooms_filled"]
for col in numeric_cols:
    if col in df.columns:
        df[f"{col}_normalized"] = scaler.fit_transform(df[[col]].fillna(0))

# Select final feature columns
feature_columns = [
    "location_encoded",
    "price_log", "price_squared", "price_vs_location_avg",
    "rating_filled", "rating_missing",
    "reviews_log", "has_reviews", "reviews_vs_location_avg",
    "bedrooms_filled", "is_studio", "is_large",
    "price_rating_interaction", "bedrooms_price_interaction",
    "is_superhost_quality",
    "amenity_count"
] + [col for col in df.columns if col.startswith("has_")]

# Keep only features that exist, and de-duplicate
# (has_reviews would otherwise appear twice)
feature_columns = list(dict.fromkeys(col for col in feature_columns if col in df.columns))

# Create feature matrix
features_df = df[feature_columns + ["title", "location", "price_clean"]]

# Save
features_df.to_csv("/app/ml_features.csv", index=False)

# Summary
summary = {
    "total_features": len(feature_columns),
    "feature_names": feature_columns,
    "sample_size": len(features_df),
    "missing_values": features_df[feature_columns].isna().sum().to_dict()
}

print(json.dumps(summary, indent=2, default=str))
'''

        result = sandbox.runCode(feature_code, language="python", timeout=120)

        if result.exitCode != 0:
            raise Exception(f"Feature engineering failed: {result.stderr}")

        print("Feature engineering complete:")
        print(result.stdout)

        # Return the feature CSV
        return sandbox.files.read("/app/ml_features.csv")

    finally:
        sandbox.kill()


# Create ML features
ml_features = create_ml_features(results["processed_data"])
```
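With the feature matrix in hand, you can train a quick baseline locally to confirm the features carry signal. The sketch below is an illustration, not part of the pipeline: it assumes `ml_features` is CSV text (as returned above) and deliberately drops price-derived columns so the price target doesn't leak into the inputs.

```python
import io
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

df = pd.read_csv(io.StringIO(ml_features))

# Drop identifiers, the target, and price-derived features (leakage)
leaky = [
    "price_log", "price_squared", "price_vs_location_avg",
    "price_rating_interaction", "bedrooms_price_interaction",
    "price_clean_normalized",
]
X = df.drop(columns=["title", "location", "price_clean"] + leaky, errors="ignore")
X = X.select_dtypes("number").fillna(0)
y = df["price_clean"].fillna(df["price_clean"].median())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
print(f"Baseline MAE: {mean_absolute_error(y_test, model.predict(X_test)):.2f}")
```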
Step 5: Complete Pipeline Class
Here's a production-ready pipeline class:
```python
from firecrawl import FirecrawlApp
from hopx import Sandbox
from typing import List, Dict, Optional
import json
import time

class AirbnbScrapingPipeline:
    """Complete pipeline for scraping and analyzing Airbnb data."""

    def __init__(self, firecrawl_key: str, hopx_key: str):
        self.firecrawl = FirecrawlApp(api_key=firecrawl_key)
        self.hopx_key = hopx_key
        self.sandbox: Optional[Sandbox] = None

    def scrape_location(
        self,
        location: str,
        checkin: str,
        checkout: str,
        max_pages: int = 1
    ) -> List[Dict]:
        """Scrape listings for a location."""

        listings = []

        for page in range(max_pages):
            url = f"https://www.airbnb.com/s/{location}/homes"
            params = f"?checkin={checkin}&checkout={checkout}"
            if page > 0:
                params += f"&items_offset={page * 20}"

            try:
                result = self.firecrawl.scrape_url(
                    url + params,
                    params={
                        "formats": ["extract"],
                        "extract": {
                            "schema": {
                                "type": "object",
                                "properties": {
                                    "listings": {
                                        "type": "array",
                                        "items": {
                                            "type": "object",
                                            "properties": {
                                                "title": {"type": "string"},
                                                "price_per_night": {"type": "string"},
                                                "rating": {"type": "string"},
                                                "reviews_count": {"type": "string"},
                                                "property_type": {"type": "string"},
                                                "bedrooms": {"type": "string"},
                                                "amenities": {
                                                    "type": "array",
                                                    "items": {"type": "string"}
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        },
                        "waitFor": 3000  # Wait for JS rendering
                    }
                )

                page_listings = result.get("extract", {}).get("listings", [])
                for listing in page_listings:
                    listing["location"] = location
                    listing["scraped_at"] = time.strftime("%Y-%m-%d %H:%M:%S")
                listings.extend(page_listings)

                print(f"  Page {page + 1}: {len(page_listings)} listings")

                # Rate limiting
                time.sleep(2)

            except Exception as e:
                print(f"  Error on page {page + 1}: {e}")
                continue

        return listings

    def get_sandbox(self) -> Sandbox:
        """Get or create a persistent sandbox."""
        if self.sandbox is None:
            self.sandbox = Sandbox.create(
                template="code-interpreter",
                ttl=600  # 10 minute TTL
            )
        return self.sandbox

    def process_data(self, raw_listings: List[Dict]) -> Dict:
        """Clean and process raw listings."""

        sandbox = self.get_sandbox()
        sandbox.files.write(
            "/app/raw_data.json",
            json.dumps(raw_listings)
        )

        code = '''
import pandas as pd
import json
import re

with open("/app/raw_data.json") as f:
    data = json.load(f)

df = pd.DataFrame(data)

# Cleaning functions
def clean_price(x):
    if not x: return None
    m = re.search(r"[\d,]+", str(x).replace(",", ""))
    return float(m.group()) if m else None

def clean_rating(x):
    if not x: return None
    m = re.search(r"(\d+\.?\d*)", str(x))
    return float(m.group()) if m else None

def clean_int(x):
    if not x: return None
    m = re.search(r"(\d+)", str(x).replace(",", ""))
    return int(m.group()) if m else None

# Apply cleaning
df["price"] = df["price_per_night"].apply(clean_price)
df["rating"] = df["rating"].apply(clean_rating)
df["reviews"] = df["reviews_count"].apply(clean_int)
df["bedrooms"] = df["bedrooms"].apply(clean_int)

# Derived features
df["value_score"] = df.apply(
    lambda x: (x["rating"] or 0) / (x["price"] or 1) * 100
    if x["price"] and x["price"] > 0 else 0,
    axis=1
)

# Save processed
df.to_csv("/app/processed.csv", index=False)

# Stats
stats = {
    "count": len(df),
    "locations": df["location"].nunique(),
    "avg_price": round(df["price"].mean(), 2),
    "avg_rating": round(df["rating"].mean(), 2),
    "price_range": [df["price"].min(), df["price"].max()]
}
print(json.dumps(stats, default=str))
'''

        result = sandbox.runCode(code, language="python", timeout=60)

        if result.exitCode != 0:
            raise Exception(f"Processing failed: {result.stderr}")

        return json.loads(result.stdout)

    def analyze(self) -> Dict:
        """Run analysis on processed data."""

        sandbox = self.get_sandbox()

        code = '''
import pandas as pd
import json

df = pd.read_csv("/app/processed.csv")

# Per-location aggregates with flat, JSON-friendly keys
by_location = df.groupby("location").agg(
    avg_price=("price", "mean"),
    median_price=("price", "median"),
    listing_count=("price", "count"),
    avg_rating=("rating", "mean"),
).round(2)

analysis = {
    "by_location": by_location.to_dict("index"),

    "top_value": df.nlargest(10, "value_score")[
        ["title", "location", "price", "rating", "value_score"]
    ].to_dict("records"),

    "price_segments": {
        "budget": len(df[df["price"] < 100]),
        "mid": len(df[(df["price"] >= 100) & (df["price"] < 250)]),
        "premium": len(df[df["price"] >= 250])
    },

    "correlations": {
        "price_rating": round(df["price"].corr(df["rating"]), 3),
        "price_reviews": round(df["price"].corr(df["reviews"]), 3)
    }
}

print(json.dumps(analysis, indent=2, default=str))
'''

        result = sandbox.runCode(code, language="python", timeout=60)

        if result.exitCode != 0:
            raise Exception(f"Analysis failed: {result.stderr}")

        return json.loads(result.stdout)

    def export_csv(self) -> str:
        """Export processed data as CSV."""
        sandbox = self.get_sandbox()
        return sandbox.files.read("/app/processed.csv")

    def cleanup(self):
        """Destroy the sandbox."""
        if self.sandbox:
            self.sandbox.kill()
            self.sandbox = None


# Usage
if __name__ == "__main__":
    import os

    pipeline = AirbnbScrapingPipeline(
        firecrawl_key=os.environ["FIRECRAWL_API_KEY"],
        hopx_key=os.environ["HOPX_API_KEY"]
    )

    try:
        # Scrape multiple locations
        all_listings = []
        for location in ["new-york", "los-angeles", "chicago"]:
            print(f"Scraping {location}...")
            listings = pipeline.scrape_location(
                location,
                checkin="2025-03-01",
                checkout="2025-03-07"
            )
            all_listings.extend(listings)

        print(f"\nTotal listings: {len(all_listings)}")

        # Process
        print("\nProcessing data...")
        stats = pipeline.process_data(all_listings)
        print(f"Stats: {stats}")

        # Analyze
        print("\nAnalyzing...")
        analysis = pipeline.analyze()
        print(json.dumps(analysis, indent=2))

        # Export
        csv_data = pipeline.export_csv()
        with open("airbnb_listings.csv", "w") as f:
            f.write(csv_data)
        print("\nExported to airbnb_listings.csv")

    finally:
        pipeline.cleanup()
```
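Because `cleanup()` must run even when scraping or analysis fails, a thin context-manager wrapper keeps call sites tidy. A small sketch over the class above:

```python
from contextlib import contextmanager

@contextmanager
def airbnb_pipeline(firecrawl_key: str, hopx_key: str):
    """Yield a pipeline and guarantee sandbox cleanup on exit."""
    pipeline = AirbnbScrapingPipeline(firecrawl_key, hopx_key)
    try:
        yield pipeline
    finally:
        pipeline.cleanup()

# Usage:
# with airbnb_pipeline(os.environ["FIRECRAWL_API_KEY"], os.environ["HOPX_API_KEY"]) as p:
#     listings = p.scrape_location("miami", "2025-03-01", "2025-03-07")
```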
Alternative: Using Crawl4AI
If you prefer an open-source tool, Crawl4AI is a solid alternative. Its extraction API has changed across versions, so treat the snippet below as a sketch and check the current Crawl4AI docs for the exact extraction-strategy usage:
```python
from crawl4ai import AsyncWebCrawler
from hopx import Sandbox
import asyncio
import json

async def scrape_with_crawl4ai(urls: list) -> list:
    """Scrape URLs using Crawl4AI."""

    results = []

    async with AsyncWebCrawler(verbose=True) as crawler:
        for url in urls:
            result = await crawler.arun(
                url=url,
                extraction_strategy="LLMExtractionStrategy",
                extraction_config={
                    "schema": {
                        "listings": [{
                            "title": "string",
                            "price": "string",
                            "rating": "string"
                        }]
                    }
                }
            )

            if result.success:
                results.append(result.extracted_content)

    return results


# Run scraping
urls = [
    "https://www.airbnb.com/s/new-york/homes",
    "https://www.airbnb.com/s/miami/homes"
]

scraped = asyncio.run(scrape_with_crawl4ai(urls))
```
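The sandbox side stays the same regardless of the scraper: normalize whatever Crawl4AI extracts into the list-of-dicts shape used earlier and reuse `process_listings_in_sandbox()` from Step 2. This sketch assumes `extracted_content` is a JSON string containing a `listings` array; adjust it to whatever shape your extraction strategy actually returns.

```python
crawl4ai_listings = []
for content in scraped:
    # extracted_content is often a JSON string; parse defensively
    try:
        payload = json.loads(content) if isinstance(content, str) else content
    except json.JSONDecodeError:
        continue
    for listing in (payload or {}).get("listings", []):
        # process_listings_in_sandbox groups by location, so make sure the key exists
        listing.setdefault("location", "unknown")
        crawl4ai_listings.append(listing)

if crawl4ai_listings:
    crawl4ai_results = process_listings_in_sandbox(crawl4ai_listings)
    print(json.dumps(crawl4ai_results["summary"], indent=2))
```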
Error Handling and Retry Logic
Production pipelines need robust error handling:
```python
import time
from typing import Callable, Any

def with_retry(
    func: Callable,
    max_retries: int = 3,
    delay: float = 2.0,
    backoff: float = 2.0
) -> Any:
    """Execute a function with exponential backoff retry."""

    last_error = None
    current_delay = delay

    for attempt in range(max_retries):
        try:
            return func()
        except Exception as e:
            last_error = e
            print(f"Attempt {attempt + 1} failed: {e}")

            if attempt < max_retries - 1:
                print(f"Retrying in {current_delay}s...")
                time.sleep(current_delay)
                current_delay *= backoff

    raise last_error


# Usage
result = with_retry(
    lambda: firecrawl.scrape_url(url, params=params),
    max_retries=3,
    delay=2.0
)
```
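The same idea works as a decorator when many call sites need retries. This is an equivalent sketch, not a separate library feature:

```python
import functools
import time

def retryable(max_retries: int = 3, delay: float = 2.0, backoff: float = 2.0):
    """Decorator form of with_retry, using the same exponential backoff."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            current_delay = delay
            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    if attempt == max_retries - 1:
                        raise
                    print(f"Attempt {attempt + 1} failed: {e}; retrying in {current_delay}s")
                    time.sleep(current_delay)
                    current_delay *= backoff
        return wrapper
    return decorator
```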
Best Practices
1. Respect Rate Limits
```python
import time

class RateLimiter:
    def __init__(self, requests_per_minute: int):
        self.min_interval = 60.0 / requests_per_minute
        self.last_request = 0.0

    def wait(self):
        elapsed = time.time() - self.last_request
        if elapsed < self.min_interval:
            time.sleep(self.min_interval - elapsed)
        self.last_request = time.time()

limiter = RateLimiter(requests_per_minute=10)
```
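Call `wait()` before each request. A usage sketch against the Step 1 scraper (10 requests per minute is an assumed example, not Firecrawl's documented limit):

```python
for location in ["new-york", "san-francisco", "miami"]:
    limiter.wait()  # blocks until enough time has passed since the last request
    data = scrape_airbnb_search(location, "2025-02-01", "2025-02-07")
```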
2. Cache Scraped Data
```python
import hashlib
import json
import os
from typing import Callable

def get_cached_or_scrape(url: str, scrape_func: Callable) -> dict:
    """Return cached data or scrape fresh."""

    cache_key = hashlib.md5(url.encode()).hexdigest()
    cache_file = f"/tmp/scrape_cache/{cache_key}.json"

    if os.path.exists(cache_file):
        with open(cache_file) as f:
            return json.load(f)

    data = scrape_func(url)

    os.makedirs("/tmp/scrape_cache", exist_ok=True)
    with open(cache_file, "w") as f:
        json.dump(data, f)

    return data
```
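Listing data goes stale quickly, so it's often worth expiring cache entries by age. A variant of the helper above (24 hours is an arbitrary default):

```python
import hashlib
import json
import os
import time
from typing import Callable

def get_cached_or_scrape_with_ttl(url: str, scrape_func: Callable, max_age_hours: float = 24) -> dict:
    """Like get_cached_or_scrape, but ignores cache entries older than max_age_hours."""
    cache_key = hashlib.md5(url.encode()).hexdigest()
    cache_file = f"/tmp/scrape_cache/{cache_key}.json"

    if os.path.exists(cache_file):
        age_hours = (time.time() - os.path.getmtime(cache_file)) / 3600
        if age_hours < max_age_hours:
            with open(cache_file) as f:
                return json.load(f)

    data = scrape_func(url)
    os.makedirs("/tmp/scrape_cache", exist_ok=True)
    with open(cache_file, "w") as f:
        json.dump(data, f)
    return data
```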
3. Validate Data Quality
```python
def validate_listings(listings: list) -> tuple:
    """Validate and filter listings."""

    valid = []
    invalid = []

    for listing in listings:
        # Must have a title and a price
        if not listing.get("title") or not listing.get("price_per_night"):
            invalid.append(listing)
            continue

        valid.append(listing)

    print(f"Valid: {len(valid)}, Invalid: {len(invalid)}")
    return valid, invalid
```
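You can tighten this further by also requiring that the price parses to a positive number, so obviously broken rows never reach the sandbox. A stricter sketch:

```python
import re

def validate_listings_strict(listings: list) -> tuple:
    """Keep only listings with a title and a parseable, positive nightly price."""
    valid, invalid = [], []

    for listing in listings:
        price_raw = str(listing.get("price_per_night", "")).replace(",", "")
        price_match = re.search(r"\d+", price_raw)

        if listing.get("title") and price_match and int(price_match.group()) > 0:
            valid.append(listing)
        else:
            invalid.append(listing)

    print(f"Valid: {len(valid)}, Invalid: {len(invalid)}")
    return valid, invalid
```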
Conclusion
Combining Firecrawl and HopX gives you a powerful, safe web scraping pipeline:
- Firecrawl handles the messy parts: JavaScript rendering, anti-bot measures, rate limits
- HopX provides secure, isolated environments for data processing
- Together they enable production-grade data pipelines without security risks
This pattern works for any scraping task—real estate, e-commerce, job listings, or any data you need to extract and analyze.
Ready to build your own data pipeline? Get started with HopX for secure data processing.
Further Reading
- LangChain Tools with Secure Code Execution — Build AI agents with safe tools
- Feature Engineering in Sandboxes — More data processing patterns
- Firecrawl Documentation — Official Firecrawl docs
- Crawl4AI GitHub — Open-source alternative