Tutoriels API

Résolution de CAPTCHA d'images par lots : traitement de plus de 1 000 images

Lorsque vous devez résoudre des centaines ou des milliers de CAPTCHA d’images, le traitement séquentiel est trop lent. Ce guide montre comment créer un pipeline de traitement par lots qui soumet, interroge et collecte simultanément les résultats de plus de 1 000 images à l'aide de CaptchaAI.


Architecture

[Image Queue] → [Submit Workers] → [Poll Workers] → [Results Store]
     ↓                ↓                  ↓                ↓
  1000 images    20 concurrent      Adaptive poll     CSV/JSON output
                   submits           intervals

Python : processeur par lots asynchrone

import asyncio
import aiohttp
import base64
import json
import time
import csv
from pathlib import Path

# Placeholder credential; sent as the "key" field of every API request.
API_KEY = "YOUR_API_KEY"
# Endpoint that receives new CAPTCHA submissions (POST).
SUBMIT_URL = "https://ocr.captchaai.com/in.php"
# Endpoint queried for the result of a submitted task (GET).
RESULT_URL = "https://ocr.captchaai.com/res.php"
# Size of the semaphore that bounds simultaneous submissions.
MAX_CONCURRENT_SUBMITS = 20
# Size of the semaphore that bounds simultaneous pollers.
MAX_CONCURRENT_POLLS = 30
# Seconds slept before each poll request.
POLL_INTERVAL = 5


async def submit_image(session, sem, image_path):
    """Submit a single image CAPTCHA and report the outcome as a dict.

    Failures are returned as error rows instead of being raised, so one bad
    file or one dropped connection cannot abort a whole ``asyncio.gather``
    batch.

    Args:
        session: Open ``aiohttp.ClientSession`` used for the POST.
        sem: ``asyncio.Semaphore`` bounding how many submissions run at once.
        image_path: Path of the image file to upload.

    Returns:
        ``{"file", "task_id", "submitted_at"}`` when the API accepts the
        image, otherwise ``{"file", "error"}`` (unreadable file, failed
        request, malformed response, or an API-level rejection).
    """
    file_name = str(image_path)

    async with sem:
        try:
            with open(image_path, "rb") as f:
                img_b64 = base64.b64encode(f.read()).decode()
        except OSError as exc:
            return {"file": file_name, "error": f"READ_ERROR: {exc}"}

        data = {
            "key": API_KEY,
            "method": "base64",
            "body": img_b64,
            "json": "1",
        }

        try:
            async with session.post(SUBMIT_URL, data=data) as resp:
                # content_type=None: accept a JSON body even when the server
                # does not label it as application/json.
                result = await resp.json(content_type=None)
        except (aiohttp.ClientError, asyncio.TimeoutError, ValueError) as exc:
            return {"file": file_name, "error": f"REQUEST_ERROR: {exc}"}

    if not isinstance(result, dict):
        return {"file": file_name, "error": "MALFORMED_RESPONSE"}

    if result.get("status") != 1:
        return {
            "file": file_name,
            "error": result.get("request", "UNKNOWN_ERROR"),
        }

    return {
        "file": file_name,
        "task_id": result["request"],
        "submitted_at": time.time(),
    }


async def poll_result(session, sem, task, max_attempts=24):
    """Poll the result endpoint until a submitted task is solved or fails.

    The semaphore is held only for the duration of each HTTP request, not
    while sleeping between attempts, so it limits in-flight requests without
    serialising the waiting time of every task behind it.

    Args:
        session: Open ``aiohttp.ClientSession`` used for the GET requests.
        sem: ``asyncio.Semaphore`` bounding simultaneous poll requests.
        task: Dict with ``"file"``, ``"task_id"`` and ``"submitted_at"``.
        max_attempts: Number of poll attempts before giving up.

    Returns:
        ``{"file", "task_id", "answer", "solve_time"}`` on success, otherwise
        ``{"file", "task_id", "error"}``. The error is the API's message,
        ``"TIMEOUT"`` when the task never became ready, or a
        ``"REQUEST_ERROR: ..."`` string when the final attempt failed at the
        transport level.
    """
    params = {
        "key": API_KEY,
        "action": "get",
        "id": task["task_id"],
        "json": "1",
    }
    failure = {"file": task["file"], "task_id": task["task_id"]}
    last_error = "TIMEOUT"

    for _ in range(max_attempts):
        await asyncio.sleep(POLL_INTERVAL)

        try:
            async with sem:
                async with session.get(RESULT_URL, params=params) as resp:
                    result = await resp.json(content_type=None)
        except (aiohttp.ClientError, asyncio.TimeoutError, ValueError) as exc:
            # Transient transport/parse failure: the task may still complete,
            # so spend one attempt and try again.
            last_error = f"REQUEST_ERROR: {exc}"
            continue

        if not isinstance(result, dict):
            last_error = "MALFORMED_RESPONSE"
            continue

        if result.get("status") == 1:
            return {
                "file": task["file"],
                "task_id": task["task_id"],
                "answer": result["request"],
                "solve_time": time.time() - task["submitted_at"],
            }

        # "CAPCHA_NOT_READY" (sic) is the literal the code compares against.
        if result.get("request") != "CAPCHA_NOT_READY":
            return {**failure, "error": result.get("request", "UNKNOWN_ERROR")}

        last_error = "TIMEOUT"

    return {**failure, "error": last_error}


async def process_batch(image_dir, output_file="results.csv"):
    """Solve every image in a directory and write the outcomes to a CSV file.

    Images are matched case-insensitively on the ``.png``, ``.jpg``,
    ``.jpeg`` and ``.gif`` extensions and processed in path order. All images
    are submitted first, then all accepted tasks are polled. An exception
    raised by an individual submission or poll is recorded as an error row
    rather than discarding the results of the whole batch.

    Args:
        image_dir: Directory containing the CAPTCHA images.
        output_file: Destination CSV path (written as UTF-8).
    """
    extensions = {".png", ".jpg", ".jpeg", ".gif"}
    image_paths = sorted(
        path
        for path in Path(image_dir).glob("*")
        if path.suffix.lower() in extensions and path.is_file()
    )

    print(f"Found {len(image_paths)} images")

    submit_sem = asyncio.Semaphore(MAX_CONCURRENT_SUBMITS)
    poll_sem = asyncio.Semaphore(MAX_CONCURRENT_POLLS)

    async with aiohttp.ClientSession() as session:
        # Phase 1: submit all images
        print("Submitting...")
        outcomes = await asyncio.gather(
            *(submit_image(session, submit_sem, path) for path in image_paths),
            return_exceptions=True,
        )
        submissions = [
            {"file": str(path), "error": repr(outcome)}
            if isinstance(outcome, BaseException)
            else outcome
            for path, outcome in zip(image_paths, outcomes)
        ]

        # Separate accepted tasks from submit-time failures
        pending = [s for s in submissions if "task_id" in s]
        errors = [s for s in submissions if "error" in s]
        print(f"Submitted: {len(pending)}, Errors: {len(errors)}")

        # Phase 2: poll all accepted tasks
        print("Polling for results...")
        outcomes = await asyncio.gather(
            *(poll_result(session, poll_sem, task) for task in pending),
            return_exceptions=True,
        )
        results = [
            {
                "file": task["file"],
                "task_id": task["task_id"],
                "error": repr(outcome),
            }
            if isinstance(outcome, BaseException)
            else outcome
            for task, outcome in zip(pending, outcomes)
        ]

    all_results = results + errors

    # Explicit encoding so non-ASCII file names or answers do not depend on
    # the platform's default locale.
    with open(output_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "file", "task_id", "answer", "solve_time", "error"
        ])
        writer.writeheader()
        writer.writerows(
            {
                "file": r.get("file", ""),
                "task_id": r.get("task_id", ""),
                "answer": r.get("answer", ""),
                "solve_time": round(r.get("solve_time", 0), 2),
                "error": r.get("error", ""),
            }
            for r in all_results
        )

    solved = sum(1 for r in results if "answer" in r)
    failed = sum(1 for r in results if "error" in r)
    print(f"Done: {solved} solved, {failed} failed, {len(errors)} submit errors")
    print(f"Results saved to {output_file}")


# Run the batch only when executed as a script, so importing this module
# does not start network traffic as a side effect.
if __name__ == "__main__":
    asyncio.run(process_batch("./captcha_images"))

Résultat attendu :

Found 1000 images
Submitting...
Submitted: 997, Errors: 3
Polling for results...
Done: 985 solved, 12 failed, 3 submit errors
Results saved to results.csv

Node.js : processeur par lots avec un pool de workers

const axios = require('axios');
const fs = require('fs');
const path = require('path');
const { createObjectCsvWriter } = require('csv-writer');

// Placeholder credential; sent as the `key` parameter of every API request.
const API_KEY = 'YOUR_API_KEY';
// Endpoint that receives new CAPTCHA submissions (POST).
const SUBMIT_URL = 'https://ocr.captchaai.com/in.php';
// Endpoint queried for the result of a submitted task (GET).
const RESULT_URL = 'https://ocr.captchaai.com/res.php';
// Default concurrency used by BatchProcessor when none is passed.
const MAX_CONCURRENT = 20;
// Delay, in milliseconds, awaited before each poll request.
const POLL_INTERVAL_MS = 5000;

/**
 * Solves every image CAPTCHA in a directory using a bounded pool of
 * concurrent workers and writes the outcomes to a CSV file.
 */
class BatchProcessor {
  /**
   * @param {number} [concurrency=MAX_CONCURRENT] Maximum number of images
   *   being processed at the same time.
   */
  constructor(concurrency = MAX_CONCURRENT) {
    this.concurrency = concurrency;
    this.results = [];
    this.processed = 0;
    this.total = 0;
  }

  /**
   * Uploads one image and returns the task id assigned by the API.
   *
   * @param {string} imagePath Path of the image file to submit.
   * @returns {Promise<string>} Task id to pass to pollResult().
   * @throws {Error} With the API's message when the submission is rejected.
   */
  async submitImage(imagePath) {
    // Async read so a large batch does not block the event loop.
    const imgBase64 = await fs.promises.readFile(imagePath, { encoding: 'base64' });

    // Send the fields as a form-encoded POST body. Putting a base64 image in
    // the query string (axios `params`) produces URLs that exceed the length
    // limits of servers and proxies.
    const form = new URLSearchParams({
      key: API_KEY,
      method: 'base64',
      body: imgBase64,
      json: '1',
    });
    const resp = await axios.post(SUBMIT_URL, form);

    if (resp.data.status !== 1) {
      throw new Error(resp.data.request);
    }
    return resp.data.request;
  }

  /**
   * Polls the result endpoint until the task is solved, fails, or 24
   * attempts have been made.
   *
   * @param {string} taskId Id returned by submitImage().
   * @returns {Promise<string>} The solved CAPTCHA text.
   * @throws {Error} With the API's error message, or 'TIMEOUT'.
   */
  async pollResult(taskId) {
    for (let i = 0; i < 24; i++) {
      await new Promise(r => setTimeout(r, POLL_INTERVAL_MS));
      const resp = await axios.get(RESULT_URL, {
        params: { key: API_KEY, action: 'get', id: taskId, json: 1 },
      });

      if (resp.data.status === 1) return resp.data.request;
      // 'CAPCHA_NOT_READY' (sic) is the literal the code compares against.
      if (resp.data.request !== 'CAPCHA_NOT_READY') {
        throw new Error(resp.data.request);
      }
    }
    throw new Error('TIMEOUT');
  }

  /**
   * Submits and polls a single image. Never rejects: failures are reported
   * through the `error` field of the returned record.
   *
   * @param {string} imagePath Path of the image file.
   * @returns {Promise<{file: string, answer: string, solveTime: (string|number), error: string}>}
   */
  async processOne(imagePath) {
    const startTime = Date.now();
    try {
      const taskId = await this.submitImage(imagePath);
      const answer = await this.pollResult(taskId);
      this.processed++;
      const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
      console.log(`[${this.processed}/${this.total}] ${path.basename(imagePath)}: ${answer} (${elapsed}s)`);
      return { file: imagePath, answer, solveTime: elapsed, error: '' };
    } catch (err) {
      this.processed++;
      return { file: imagePath, answer: '', solveTime: 0, error: err.message };
    }
  }

  /**
   * Processes every png/jpg/jpeg/gif file in `imageDir` and writes one CSV
   * row per image to `outputFile`.
   *
   * @param {string} imageDir Directory containing the images.
   * @param {string} [outputFile='results.csv'] Destination CSV path.
   * @returns {Promise<void>}
   */
  async run(imageDir, outputFile = 'results.csv') {
    const files = fs.readdirSync(imageDir)
      .filter(f => /\.(png|jpg|jpeg|gif)$/i.test(f))
      .map(f => path.join(imageDir, f));

    this.total = files.length;
    console.log(`Processing ${this.total} images with ${this.concurrency} workers`);

    // Worker pool: each worker pulls the next unprocessed file as soon as it
    // is free, so one slow image no longer stalls a whole chunk. Results are
    // stored by index to keep the CSV in directory order.
    const ordered = new Array(files.length);
    let nextIndex = 0;
    const worker = async () => {
      while (nextIndex < files.length) {
        const index = nextIndex++;
        ordered[index] = await this.processOne(files[index]);
      }
    };
    const limit = Math.max(1, Math.floor(this.concurrency) || 1);
    const workerCount = Math.min(limit, files.length);
    await Promise.all(Array.from({ length: workerCount }, () => worker()));

    for (const record of ordered) {
      this.results.push(record);
    }

    // Write CSV
    const csvWriter = createObjectCsvWriter({
      path: outputFile,
      header: [
        { id: 'file', title: 'File' },
        { id: 'answer', title: 'Answer' },
        { id: 'solveTime', title: 'Solve Time (s)' },
        { id: 'error', title: 'Error' },
      ],
    });
    await csvWriter.writeRecords(this.results);

    const solved = this.results.filter(r => r.answer).length;
    console.log(`Done: ${solved}/${this.total} solved. Results: ${outputFile}`);
  }
}

const processor = new BatchProcessor(20);
// Handle a rejected run (e.g. missing directory, CSV write failure) instead
// of leaving an unhandled promise rejection.
processor.run('./captcha_images').catch((err) => {
  console.error(`Batch failed: ${err.message}`);
  process.exitCode = 1;
});

Traitement par lots tenant compte du débit

Évitez les erreurs 429 en suivant votre taux de soumission :

class RateLimiter:
    """Sliding-window rate limiter for asyncio code.

    Grants at most ``max_per_second`` acquisitions within any window of
    ``period`` seconds. The limit also holds when many coroutines wait at the
    same time, because each waiter re-checks the window after sleeping
    instead of proceeding unconditionally.

    Args:
        max_per_second: Maximum acquisitions allowed per window.
        period: Window length in seconds (1.0 gives a per-second limit).

    Raises:
        ValueError: If ``max_per_second`` or ``period`` is not positive.
    """

    def __init__(self, max_per_second=10, period=1.0):
        if max_per_second <= 0:
            raise ValueError("max_per_second must be positive")
        if period <= 0:
            raise ValueError("period must be positive")
        self.max_per_second = max_per_second
        self.period = period
        # Grant times (time.monotonic()) still inside the current window,
        # oldest first.
        self.timestamps = []

    async def acquire(self):
        """Wait until a slot is free in the current window, then claim it."""
        while True:
            # Monotonic clock: unaffected by wall-clock adjustments.
            now = time.monotonic()
            self.timestamps = [
                t for t in self.timestamps if now - t < self.period
            ]

            # There is no await between this check and the append, so the
            # claim cannot be interleaved with another coroutine's claim.
            if len(self.timestamps) < self.max_per_second:
                self.timestamps.append(now)
                return

            # Sleep until the oldest grant leaves the window, then re-check:
            # other waiters may have taken the freed slots first.
            await asyncio.sleep(self.period - (now - self.timestamps[0]))

# Use in submit loop
rate_limiter = RateLimiter(max_per_second=10)

async def submit_with_rate_limit(session, image_path):
    """Wait for a rate-limiter slot before submitting an image.

    Illustrative snippet: only the throttling step is shown. The submission
    itself is elided (see the placeholder comment below), so as written this
    coroutine returns None.
    """
    await rate_limiter.acquire()
    # ... submit as before

Suivi de la progression

import sys

class ProgressTracker:
    """Keep running totals for a batch and print a one-line status."""

    def __init__(self, total):
        """Start the clock and zero the counters for ``total`` items."""
        self.total = total
        self.completed = self.solved = self.failed = 0
        self.start_time = time.time()

    def update(self, success=True):
        """Record one finished item and redraw the status line in place.

        Args:
            success: Whether the item was solved (counted as failed if not).
        """
        self.completed += 1
        if not success:
            self.failed += 1
        else:
            self.solved += 1

        elapsed = time.time() - self.start_time

        # Throughput so far; stays 0 until some time has elapsed.
        rate = 0
        if elapsed > 0:
            rate = self.completed / elapsed

        # Estimated seconds remaining at the current throughput.
        eta = 0
        if rate > 0:
            remaining = self.total - self.completed
            eta = remaining / rate

        # The leading "\r" returns to the start of the line so each update
        # overwrites the previous one.
        status = (
            f"\r[{self.completed}/{self.total}] "
            f"Solved: {self.solved} | Failed: {self.failed} | "
            f"Rate: {rate:.1f}/s | ETA: {eta:.0f}s"
        )
        sys.stdout.write(status)
        sys.stdout.flush()

Dépannage

| Problème | Cause | Solution |
|---|---|---|
| Réponses 429 | Trop de requêtes simultanées | Réduisez MAX_CONCURRENT_SUBMITS, ajoutez un limiteur de débit |
| Nombreux délais d'attente dépassés | Interrogation trop courte ou images trop complexes | Augmentez le nombre de tentatives ou l'intervalle d'interrogation |
| ERROR_ZERO_BALANCE en cours de lot | Le solde est épuisé | Vérifiez le solde avant de commencer ; estimez le coût |
| Taux d'erreur élevé | Images corrompues ou surdimensionnées | Validez les images avant de les soumettre |

FAQ

Combien d’images puis-je soumettre simultanément ?

20 à 30 soumissions simultanées fonctionnent bien. Au-delà, vous risquez d’atteindre les limites de débit. Utilisez un sémaphore pour limiter la concurrence.

Combien coûtent 1000 images ?

Vérifiez votre tarif actuel sur captchaai.com. Les CAPTCHA Image/OCR font partie des types de résolution les moins chers.


Traitez des milliers de CAPTCHA avec CaptchaAI

Obtenez votre clé API sur captchaai.com.


Guides associés

Les commentaires sont désactivés pour cet article.