extract prices pdfco

This commit is contained in:
2025-11-17 15:05:10 +01:00
parent 6b65b31932
commit 1804605cad
16 changed files with 1131 additions and 330 deletions
+217
View File
@@ -0,0 +1,217 @@
<?php
header("Content-Type: application/json; charset=utf-8");
// ---------------------------------------------------------
// 1) CONFIGURATION
// ---------------------------------------------------------
// SECURITY NOTE(review): a live PDF.co API key is committed in this file.
// The PDFCO_API_KEY environment variable now takes precedence; the literal
// below is kept only as a fallback so existing deployments keep working.
// It should be rotated and removed from version control.
$apiKey = getenv("PDFCO_API_KEY");
if ($apiKey === false || $apiKey === "") {
    $apiKey = "info@claudiosironi.com_Qfh02D7sAvi2tcx3ZchHpusNaBquCKhJw81fEnkHe2ersQDVOex4IokhCCzaFAz1";
}
$fileToken = "filetoken://61a780917907f86a340290d22c449357dc68950e9066bd67b2";
// Optional: override the file token via GET.
// is_string() rejects array input such as ?token[]=x, which would otherwise
// be forwarded to the API as a JSON array.
// NOTE(review): the value is forwarded unchanged as the "url" the API fetches,
// so any caller can spend this account's quota — consider restricting it.
if (isset($_GET['token']) && is_string($_GET['token']) && $_GET['token'] !== "") {
    $fileToken = $_GET['token'];
}
// ---------------------------------------------------------
// 2) PDF.CO → JSON2 (ALL PAGES)
// ---------------------------------------------------------
$endpoint = "https://api.pdf.co/v1/pdf/convert/to/json2";
$payload = [
    "url" => $fileToken,
    "inline" => true,
    "detectTables" => true,
    "pages" => "all"
];
$ch = curl_init($endpoint);
if ($ch === false) {
    echo json_encode(["error" => "Errore CURL: inizializzazione fallita"]);
    exit;
}
curl_setopt_array($ch, [
    CURLOPT_POST => true,
    CURLOPT_HTTPHEADER => [
        "Content-Type: application/json",
        "x-api-key: $apiKey"
    ],
    CURLOPT_POSTFIELDS => json_encode($payload),
    CURLOPT_RETURNTRANSFER => true,
    // Fail fast when the API host is unreachable instead of hanging the request.
    CURLOPT_CONNECTTIMEOUT => 10
]);
$response = curl_exec($ch);
// Read the error text before closing the handle, and close it on every path.
$curlError = curl_error($ch);
curl_close($ch);
// With CURLOPT_RETURNTRANSFER, curl_exec() returns false only on a transport
// failure; an empty body is not a cURL error and is rejected by the validation below.
if ($response === false) {
    echo json_encode(["error" => "Errore CURL: " . $curlError]);
    exit;
}
$json = json_decode($response, true);
// ---------------------------------------------------------
// 3) VALIDATION
// ---------------------------------------------------------
// The page collection must exist and be an array, because it is iterated later.
if (!isset($json["body"]["document"]["page"]) || !is_array($json["body"]["document"]["page"])) {
    echo json_encode([
        "error" => true,
        "message" => "PDF.co JSON senza pagine valide",
        "raw" => $json
    ]);
    exit;
}
$pages = $json["body"]["document"]["page"];
// ---------------------------------------------------------
// 4) UTILITY FUNCTIONS
// ---------------------------------------------------------

/**
 * Tests whether a string is a "type A" code, e.g. "291 02F":
 * three digits, whitespace, then two or three digits / uppercase letters.
 *
 * @param string $txt Candidate text.
 * @return int|false Raw preg_match() result: 1 on match, 0 otherwise.
 */
function isCodeTypeA($txt)
{
    $typeAPattern = '/^\d{3}\s+[0-9A-Z]{2,3}$/';
    $matched = preg_match($typeAPattern, $txt);

    return $matched;
}
/**
 * Extracts every "type C" block found in a line. A block is four
 * whitespace-separated tokens such as "499 3A 14 5.475":
 * 3-digit code, 1-3 character variant, 2-digit size, dotted price.
 *
 * @param string $line Text line to scan.
 * @return array List of ["codice", "variante", "dimensione", "prezzo"] maps;
 *               "prezzo" is the price token with its dot removed, as a float.
 */
function extractTypeCBlocks($line)
{
    $blockPattern = '/(\d{3})\s+([0-9A-Z]{1,3})\s+(\d{2})\s+(\d{1,3}\.\d{3})/';
    preg_match_all($blockPattern, $line, $hits, PREG_SET_ORDER);

    return array_map(function ($hit) {
        // $hit[0] is the full match; the four capture groups follow it.
        list(, $code, $variant, $size, $rawPrice) = $hit;

        return [
            "codice" => $code,
            "variante" => $variant,
            "dimensione" => $size,
            "prezzo" => floatval(str_replace(".", "", $rawPrice)),
        ];
    }, $hits);
}
// ---------------------------------------------------------
// 5) PARSER FOR A SINGLE (RECONSTRUCTED) LINE
// ---------------------------------------------------------

/**
 * Converts one numeric token matched by /\d{1,3}[.,]\d{2,3}/ into a float.
 *
 * Rules:
 *  - comma           -> decimal separator   ("12,50" => 12.5, "1,234" => 1.234)
 *  - dot + 3 digits  -> thousands separator ("5.475" => 5475.0)
 *  - dot + 2 digits  -> decimal separator   ("12.50" => 12.5); a thousands
 *                       group always has three digits, so this cannot be one.
 *
 * @param string $token Matched numeric token.
 * @return float
 */
function normalizePriceToken($token)
{
    if (strpos($token, ",") !== false) {
        return floatval(str_replace(",", ".", str_replace(".", "", $token)));
    }
    if (preg_match('/\.\d{3}$/', $token)) {
        return floatval(str_replace(".", "", $token));
    }
    return floatval($token);
}

/**
 * Parses one reconstructed text line and classifies it into a known format.
 *
 * Formats are tried in this order and the first match wins:
 *  - "A":  3-digit code + 2-3 char suffix, free text, at least two numeric tokens.
 *  - "B1": 3-digit code + 1-3 letter suffix, description, at least four
 *          trailing numeric tokens.
 *  - "C":  one or more inline blocks recognised by extractTypeCBlocks(); the
 *          remaining text becomes the description.
 *
 * @param string $line Text line to parse.
 * @return array|null Map with "type" plus format-specific keys, or null when
 *                    the line matches no format.
 */
function parseLine($line)
{
    $pricePattern = '/\d{1,3}[.,]\d{2,3}/';

    // FORMAT A
    if (preg_match('/^(\d{3}\s+[0-9A-Z]{2,3})\s+(.*)$/', $line, $mA)) {
        $resto = trim($mA[2]);
        preg_match_all($pricePattern, $resto, $matchesPrezzi);
        // A single numeric token is not enough to treat the line as a price row.
        if (count($matchesPrezzi[0]) >= 2) {
            return [
                "type" => "A",
                "codice" => trim($mA[1]),
                // The description is whatever remains once the numeric tokens are removed.
                "descrizione" => trim(preg_replace($pricePattern, '', $resto)),
                "prezzi" => array_map('normalizePriceToken', $matchesPrezzi[0])
            ];
        }
    }

    // FORMAT B1
    // NOTE(review): without the "u" modifier the À-ù range is matched byte-wise,
    // not as a Unicode code point range — confirm this is the intended character set.
    if (preg_match(
        '/^(\d{3}\s+[A-Z]{1,3})\s+([A-Za-zÀ-ù0-9\s]+?)\s+((?:\d{1,3}[.,]\d{2,3}\s*){4,})$/',
        $line,
        $mB
    )) {
        preg_match_all($pricePattern, trim($mB[3]), $vv);
        return [
            "type" => "B1",
            "codice" => trim($mB[1]),
            "descrizione" => trim($mB[2]),
            "prezzi" => array_map('normalizePriceToken', $vv[0])
        ];
    }

    // FORMAT C (blocks repeated within the line)
    $blocks = extractTypeCBlocks($line);
    if (!empty($blocks)) {
        $desc = $line;
        foreach ($blocks as $b) {
            // Strip each recognised block from the line; preg_quote keeps the
            // dynamic pattern safe even if the captured tokens ever change shape.
            $pattern = sprintf(
                '/%s\s+%s\s+%s\s+\d{1,3}\.\d{3}/',
                preg_quote($b["codice"], '/'),
                preg_quote($b["variante"], '/'),
                preg_quote($b["dimensione"], '/')
            );
            $desc = preg_replace($pattern, "", $desc);
        }
        return [
            "type" => "C",
            "descrizione" => trim($desc),
            "varianti" => $blocks
        ];
    }

    // No format matched
    return null;
}
// ---------------------------------------------------------
// 6) PROCESS ALL PAGES
// ---------------------------------------------------------
$items = [];
foreach ($pages as $p) {
    if (!isset($p["row"]) || !is_array($p["row"])) continue;
    // Rebuild the real text lines ("flat lines") by joining the cells of each row.
    $flatLines = [];
    foreach ($p["row"] as $row) {
        // A row without a column list would raise warnings that corrupt the JSON output.
        if (!isset($row["column"]) || !is_array($row["column"])) continue;
        $cells = [];
        foreach ($row["column"] as $col) {
            // isset() rather than !empty(): empty("0") is true, so a cell whose
            // text is exactly "0" would otherwise be dropped.
            if (!isset($col["text"]["text"])) continue;
            $text = $col["text"]["text"];
            // Only scalar text can be trimmed and concatenated.
            if (!is_string($text) && !is_int($text) && !is_float($text)) continue;
            $text = trim((string) $text);
            if ($text !== "") {
                $cells[] = $text;
            }
        }
        $line = implode(" ", $cells);
        if ($line !== "") {
            $flatLines[] = $line;
        }
    }
    // Run the parser on every rebuilt line; lines matching no format are ignored.
    foreach ($flatLines as $l) {
        $parsed = parseLine($l);
        if ($parsed !== null) {
            $items[] = $parsed;
        }
    }
}
// ---------------------------------------------------------
// 7) FINAL OUTPUT
// ---------------------------------------------------------
echo json_encode([
    "status" => "ok",
    "total" => count($items),
    "items" => $items
], JSON_PRETTY_PRINT);