Tutoriels

Créez un agrégateur d'offres d'emploi avec CaptchaAI

Récupérez les offres d'emploi de plusieurs sites d'emploi, gérez les défis CAPTCHA avec CaptchaAI, normalisez les données et stockez-les à des fins de recherche et d'analyse.


Architecture

[Job Board A] ──┐
[Job Board B] ──┼──> Scraper + CAPTCHA Solver ──> Normalizer ──> SQLite DB
[Job Board C] ──┘

Modèle de données des offres d'emploi

# models.py
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
import sqlite3
import json


def _current_timestamp() -> str:
    """Return the current local time formatted as an ISO-8601 string."""
    return datetime.now().isoformat()


@dataclass
class JobListing:
    """A single job posting in the common shape shared by all sources.

    The first five fields are required and positional; every other field
    has a default so a scraper can fill in only what it managed to extract.
    """

    # --- required ---
    title: str
    company: str
    location: str
    url: str
    source: str

    # --- optional ---
    salary_min: Optional[float] = None
    salary_max: Optional[float] = None
    posted_date: Optional[str] = None
    description: str = ""
    # A fresh list per instance; never shared between listings.
    tags: list = field(default_factory=list)
    # Stamped when the instance is created unless the caller supplies a value.
    scraped_at: str = field(default_factory=_current_timestamp)


class JobDatabase:
    """SQLite-backed store for scraped job listings.

    Listings are de-duplicated on the ``url`` column (declared UNIQUE).
    The instance can be used as a context manager so the connection is
    closed deterministically::

        with JobDatabase("jobs.db") as db:
            db.insert(job)
    """

    def __init__(self, db_path="jobs.db"):
        """Open (or create) the database at *db_path* and ensure the table exists."""
        self.conn = sqlite3.connect(db_path)
        self._create_table()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always release the connection; returning None lets exceptions propagate.
        self.close()

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()

    def _create_table(self):
        """Create the ``jobs`` table if it does not exist yet."""
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS jobs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT NOT NULL,
                company TEXT NOT NULL,
                location TEXT,
                url TEXT UNIQUE,
                source TEXT,
                salary_min REAL,
                salary_max REAL,
                posted_date TEXT,
                description TEXT,
                tags TEXT,
                scraped_at TEXT
            )
        """)
        self.conn.commit()

    def insert(self, job: "JobListing"):
        """Store *job*, silently skipping rows that violate a table constraint.

        Args:
            job: Object exposing the ``JobListing`` attributes. ``tags`` is
                serialized to a JSON string before being stored.

        Returns:
            True if a new row was written, False if the listing was skipped
            (for example because its URL is already in the table).
        """
        try:
            # The connection context manager commits on success and rolls
            # back if the statement raises.
            with self.conn:
                cursor = self.conn.execute(
                    """INSERT OR IGNORE INTO jobs
                       (title, company, location, url, source,
                        salary_min, salary_max, posted_date,
                        description, tags, scraped_at)
                       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                    (job.title, job.company, job.location, job.url,
                     job.source, job.salary_min, job.salary_max,
                     job.posted_date, job.description,
                     json.dumps(job.tags), job.scraped_at),
                )
        except sqlite3.IntegrityError:
            # Kept as a safety net; OR IGNORE normally absorbs duplicates.
            return False
        # rowcount is 0 when OR IGNORE dropped the row.
        return cursor.rowcount > 0

    @staticmethod
    def _like_pattern(text):
        """Build a "contains *text*" LIKE pattern with wildcards escaped.

        ``%`` and ``_`` in user input would otherwise act as wildcards, so a
        search for ``100%`` or ``c_dev`` would match unrelated rows.
        """
        escaped = (
            text.replace("\\", "\\\\")
                .replace("%", "\\%")
                .replace("_", "\\_")
        )
        return f"%{escaped}%"

    def search(self, keyword, location=None):
        """Return rows whose title contains *keyword*, newest first.

        Args:
            keyword: Literal text to look for in ``title``.
            location: Optional literal text that ``location`` must contain.

        Returns:
            A list of row tuples in table column order, sorted by
            ``scraped_at`` descending.
        """
        query = "SELECT * FROM jobs WHERE title LIKE ? ESCAPE '\\'"
        params = [self._like_pattern(keyword)]
        if location:
            query += " AND location LIKE ? ESCAPE '\\'"
            params.append(self._like_pattern(location))
        query += " ORDER BY scraped_at DESC"
        return self.conn.execute(query, params).fetchall()

Classe de base du scraper gérant les CAPTCHA

# scraper_base.py
import requests
import re
import time
import os


class BaseScraper:
    """Fetch pages through one shared session, solving reCAPTCHA via CaptchaAI.

    Subclasses call :meth:`fetch`; if the returned HTML contains a reCAPTCHA
    widget, the challenge is solved and the token is POSTed back to the URL.
    """

    # Read with .get() so that importing this module does not fail when the
    # variable is unset; the key is only required once a CAPTCHA is solved.
    API_KEY = os.environ.get("CAPTCHAAI_API_KEY")

    SUBMIT_URL = "https://ocr.captchaai.com/in.php"
    RESULT_URL = "https://ocr.captchaai.com/res.php"

    # Polling schedule in seconds; override in a subclass to tune.
    INITIAL_WAIT = 15
    POLL_INTERVAL = 5
    MAX_POLLS = 24

    # Accept both data-sitekey="..." and data-sitekey='...'.
    _SITEKEY_RE = re.compile(r"""data-sitekey=(["'])([^"']+)\1""")

    def __init__(self, source_name):
        """Create a scraper labelled *source_name* with a persistent session."""
        self.source = source_name
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 Chrome/125.0.0.0 Safari/537.36",
        })

    def fetch(self, url):
        """Return the HTML of *url*, solving a reCAPTCHA first if one is shown.

        Raises:
            ValueError: A CAPTCHA marker was found but no sitekey.
            RuntimeError: The API key is missing or CaptchaAI reported an error.
            TimeoutError: The solution was not ready within the polling budget.
        """
        resp = self.session.get(url, timeout=20)

        if self._has_captcha(resp.text):
            token = self._solve_captcha(url, resp.text)
            # Re-submit on the same session so cookies set by the first
            # response accompany the token.
            resp = self.session.post(url, data={
                "g-recaptcha-response": token,
            }, timeout=30)

        return resp.text

    def _has_captcha(self, html):
        """Return True if *html* contains a reCAPTCHA marker."""
        return "data-sitekey" in html or "g-recaptcha" in html

    def _solve_captcha(self, url, html):
        """Submit the page's reCAPTCHA to CaptchaAI and return the token.

        Args:
            url: Address of the page showing the challenge.
            html: Page source, searched for the ``data-sitekey`` attribute.

        Raises:
            ValueError: No sitekey attribute in *html*.
            RuntimeError: API key missing, or the service returned an error.
            TimeoutError: No solution after ``MAX_POLLS`` polls.
        """
        match = self._SITEKEY_RE.search(html)
        if not match:
            raise ValueError("No sitekey found")
        if not self.API_KEY:
            raise RuntimeError(
                "CAPTCHAAI_API_KEY environment variable is not set"
            )

        sitekey = match.group(2)

        submitted = requests.post(self.SUBMIT_URL, data={
            "key": self.API_KEY,
            "method": "userrecaptcha",
            "googlekey": sitekey,
            "pageurl": url,
            "json": 1,
        }, timeout=30).json()
        # On failure "request" holds an error code, not a task id; polling
        # with it would only burn the whole timeout.
        if submitted.get("status") != 1:
            raise RuntimeError(submitted.get("request", "CAPTCHA submit failed"))
        task_id = submitted["request"]
        time.sleep(self.INITIAL_WAIT)

        for _ in range(self.MAX_POLLS):
            data = requests.get(self.RESULT_URL, params={
                "key": self.API_KEY, "action": "get",
                "id": task_id, "json": 1,
            }, timeout=15).json()
            if data.get("status") == 1:
                return data["request"]
            if data.get("request") != "CAPCHA_NOT_READY":
                raise RuntimeError(data.get("request"))
            time.sleep(self.POLL_INTERVAL)

        raise TimeoutError("CAPTCHA solve timeout")

Scraper pour un site d'emploi

# scrapers.py
from bs4 import BeautifulSoup
from scraper_base import BaseScraper
from models import JobListing
import re


class GenericJobScraper(BaseScraper):
    """Scrape a job board search results page.

    The board is described by a URL template with ``{keyword}``,
    ``{location}`` and ``{page}`` placeholders plus a dict of CSS selectors
    (``card``, ``title``, ``company`` required; ``location`` and ``link``
    optional).
    """

    # Two amounts separated by a dash or the word "to". Each amount must start
    # with a digit, so a lone comma can never be captured and fed to float().
    # NOTE(review): any "N-M" pair in the card text still matches (e.g.
    # "3-5 years"); tighten per board if that proves noisy.
    _SALARY_RE = re.compile(
        r"\$?(\d[\d,]*(?:\.\d+)?)\s*(?:-+|–|—|to)\s*\$?(\d[\d,]*(?:\.\d+)?)",
        re.IGNORECASE,
    )

    def __init__(self, source_name, base_url, selectors):
        """Store the URL template and selectors for the board *source_name*."""
        super().__init__(source_name)
        self.base_url = base_url
        self.selectors = selectors

    def scrape_search(self, keyword, location="", max_pages=3):
        """Collect listings for *keyword* from pages 1..*max_pages*.

        Stops at the first page that yields no listings.

        Returns:
            A list of ``JobListing`` objects in page order.
        """
        from urllib.parse import quote_plus

        jobs = []
        # quote_plus turns spaces into "+" like the old replace() did, and
        # also escapes characters such as "&", "#" or "+" ("C++").
        encoded_keyword = quote_plus(keyword)
        encoded_location = quote_plus(location)

        for page in range(1, max_pages + 1):
            url = self.base_url.format(
                keyword=encoded_keyword,
                location=encoded_location,
                page=page,
            )
            html = self.fetch(url)
            page_jobs = self._parse_listings(html, page_url=url)

            if not page_jobs:
                break
            jobs.extend(page_jobs)

        return jobs

    def _parse_listings(self, html, page_url=None):
        """Turn one results page into ``JobListing`` objects.

        Args:
            html: Page source.
            page_url: URL the page was fetched from. When given, relative
                links are resolved against it so stored URLs are absolute.

        Cards lacking a title or company element are skipped.
        """
        from urllib.parse import urljoin

        soup = BeautifulSoup(html, "html.parser")
        cards = soup.select(self.selectors["card"])
        jobs = []

        for card in cards:
            title_el = card.select_one(self.selectors["title"])
            company_el = card.select_one(self.selectors["company"])
            location_el = card.select_one(self.selectors.get("location", ".location"))
            link_el = card.select_one(self.selectors.get("link", "a"))

            if not title_el or not company_el:
                continue

            # .get() tolerates a matched element without an href attribute.
            href = link_el.get("href", "") if link_el else ""
            if href and page_url:
                href = urljoin(page_url, href)

            salary_min, salary_max = self._extract_salary(card.get_text())

            jobs.append(JobListing(
                title=title_el.get_text(strip=True),
                company=company_el.get_text(strip=True),
                location=location_el.get_text(strip=True) if location_el else "",
                url=href,
                source=self.source,
                salary_min=salary_min,
                salary_max=salary_max,
            ))

        return jobs

    def _extract_salary(self, text):
        """Return ``(min, max)`` floats for the first salary range in *text*.

        Returns ``(None, None)`` when no range is found.
        """
        match = self._SALARY_RE.search(text)
        if match is None:
            return (None, None)
        low, high = (float(part.replace(",", "")) for part in match.groups())
        return (low, high)

Script d'exécution

# main.py
import time
from models import JobDatabase
from scrapers import GenericJobScraper

# One entry per job board to scrape. Each entry supplies the three arguments
# passed to GenericJobScraper: a display name, a search-URL template, and the
# CSS selectors used to pick fields out of each result card.
BOARDS = [
    {
        "name": "Board A",
        # {keyword}, {location} and {page} are filled in with str.format()
        # by the scraper for every results page.
        "base_url": "https://board-a.example.com/search?q={keyword}&l={location}&p={page}",
        "selectors": {
            # "card", "title" and "company" are looked up directly by the
            # scraper and are therefore required.
            "card": ".job-card",
            "title": ".job-title",
            "company": ".company-name",
            # Optional: the scraper falls back to ".location" and "a".
            "location": ".job-location",
            "link": "a.job-link",
        },
    },
]


def main():
    """Scrape every board in BOARDS for each keyword, store the listings,
    then run a sample search.

    A failure while scraping one keyword is reported and skipped so the
    remaining keywords and boards still run. The database connection is
    closed on exit, including when an error escapes.
    """
    db = JobDatabase()
    keywords = ["python developer", "data engineer"]

    try:
        for board in BOARDS:
            scraper = GenericJobScraper(board["name"], board["base_url"], board["selectors"])

            for keyword in keywords:
                print(f"Scraping {board['name']} for '{keyword}'...")
                try:
                    jobs = scraper.scrape_search(keyword, location="Remote")
                except Exception as exc:
                    # Top-level boundary: network errors, CAPTCHA failures
                    # and timeouts are reported instead of aborting the run.
                    print(f"  Failed: {exc}")
                    jobs = []

                for job in jobs:
                    db.insert(job)
                    print(f"  {job.title} at {job.company}")

                # Pause between searches to keep the request rate low.
                time.sleep(5)

        # Search example
        results = db.search("python", "Remote")
        print(f"\nFound {len(results)} matching jobs")
    finally:
        db.conn.close()


if __name__ == "__main__":
    main()

Dépannage

| Problème | Cause | Solution |
|---|---|---|
| Annonces en double | Même offre présente sur plusieurs pages | Déduplication basée sur l'URL via la contrainte UNIQUE |
| L'extraction des salaires échoue | Format non standard | Personnaliser l'expression régulière de _extract_salary pour chaque site |
| CAPTCHA sur chaque page | Session non persistante | Réutiliser self.session entre les requêtes |
| Annonces vides après résolution | Le formulaire CAPTCHA nécessite JavaScript | Passer à Selenium + CaptchaAI |

FAQ

Comment gérer la pagination ?

Le scraper parcourt les pages de 1 à max_pages. Si une page ne renvoie aucune offre, il s'arrête avant d'atteindre max_pages.

Puis-je ajouter facilement de nouveaux sites d'emploi ?

Oui. Ajoutez une nouvelle entrée à BOARDS avec le modèle d'URL du site et ses sélecteurs CSS.

Comment éviter d'être bloqué ?

Limitez le débit des requêtes avec time.sleep(), alternez les User-Agent et conservez la même session d'une requête à l'autre.


Guides connexes


Agrégez des données d'emploi — commencez avec CaptchaAI.

Les commentaires sont désactivés pour cet article.