{"access":{"authentication":{"format":"L402 {macaroon}:{proof_of_payment}","header":"Authorization","protocols":["L402","MPP"],"format_mpp":"Payment <base64url_json>"},"endpoint":"https://sats4ai.com/api/l402/detect-objects","method":"POST"},"content_type":"api","cover_url":"","description":"Detect any object in an image from a text description. Uses Grounding DINO (ECCV 2024). Returns bounding box coordinates, labels, and confidence scores as structured JSON. LLMs can describe images but can't output precise pixel coordinates. 5 sats per image, no API key or signup needed.\n\nSend the image and query in the body:\n```\n{ \"image\": \"<base64 or data URI>\", \"query\": \"person, car, dog\" }\n```","name":"Object Detection","pricing":[{"tier":"Standard","amount":5,"currency":"SATS"}],"quality":{"model":"Grounding DINO","paper":"Grounding DINO: Marrying DINO with Grounded Pre-Training (ECCV 2024)","benchmark":"COCO zero-shot","scores":{"AP":52.5},"source":"https://paperswithcode.com/paper/grounding-dino-marrying-dino-with-grounded"},"response_body":{"detections":{"type":"array","description":"Array of { label, confidence, box: [x1,y1,x2,y2] }"}},"request_body":{"image":{"type":"string","required":true,"description":"Base64-encoded image or data URI"},"query":{"type":"string","required":true,"description":"Comma-separated object names to detect"},"box_threshold":{"type":"number","required":false,"description":"Detection confidence threshold, 0-1, default 0.25"},"text_threshold":{"type":"number","required":false,"description":"Text matching threshold, 0-1, default 0.25"}}}