"""Estimate the cross-section area of a closed vector profile in a PDF.

The first page of the PDF is scanned for vector drawings. Each drawing is
flattened to a list of points, obvious false positives (rectangles, frames,
thin strokes, glyph-sized details, page-sized elements) are rejected, and the
remaining candidates are ranked by a plausibility score.
"""

import math

import fitz
from shapely.geometry import Polygon
from shapely.validation import make_valid

# One PDF point is 1/72 inch and one inch is 25.4 mm.
POINT_TO_MM = 25.4 / 72.0

# Upper bound on how many rejected candidates are retained for diagnostics,
# so a page full of tiny glyph outlines does not bloat the response.
_MAX_REJECTED_KEPT = 80


def point_to_tuple(point):
    """Return ``(x, y)`` as plain floats for an object exposing ``.x``/``.y``."""
    return float(point.x), float(point.y)


def distance(p1, p2):
    """Return the Euclidean distance between two ``(x, y)`` tuples."""
    return math.hypot(p1[0] - p2[0], p1[1] - p2[1])


def rect_to_polygon(rect):
    """Return the five-point closed outline of a rect with x0/y0/x1/y1.

    The first corner is repeated at the end so the ring is explicitly closed.
    """
    return [
        (float(rect.x0), float(rect.y0)),
        (float(rect.x1), float(rect.y0)),
        (float(rect.x1), float(rect.y1)),
        (float(rect.x0), float(rect.y1)),
        (float(rect.x0), float(rect.y0)),
    ]


def cubic_bezier_points(p0, p1, p2, p3, steps=32):
    """Sample a cubic Bezier curve into ``steps`` points.

    The start point ``p0`` (t = 0) is NOT included; sampling covers
    t = 1/steps ... 1, so the last returned point equals ``p3``. Callers are
    expected to have appended the start point already.
    """
    points = []
    for i in range(1, steps + 1):
        t = i / steps
        x = (
            (1 - t) ** 3 * p0[0]
            + 3 * (1 - t) ** 2 * t * p1[0]
            + 3 * (1 - t) * t ** 2 * p2[0]
            + t ** 3 * p3[0]
        )
        y = (
            (1 - t) ** 3 * p0[1]
            + 3 * (1 - t) ** 2 * t * p1[1]
            + 3 * (1 - t) * t ** 2 * p2[1]
            + t ** 3 * p3[1]
        )
        points.append((x, y))
    return points


def polygon_area_mm2(points, scale_ratio=1.0):
    """Return the polygon area in mm^2, or ``None`` if there is no area.

    ``points`` are in PDF points. The area is converted to mm^2 and divided
    by ``scale_ratio ** 2`` to undo the drawing scale. Invalid polygons
    (e.g. self-intersecting) are repaired with ``make_valid`` first.
    """
    # A ring needs at least three distinct vertices; return None instead of
    # letting the Polygon constructor raise on degenerate input.
    if len(points) < 3:
        return None

    polygon = Polygon(points)
    if not polygon.is_valid:
        polygon = make_valid(polygon)
    if polygon.is_empty:
        return None

    area_points2 = abs(float(polygon.area))
    area_mm2 = area_points2 * (POINT_TO_MM ** 2)
    return area_mm2 / (scale_ratio ** 2)


def get_bounds_mm(points, scale_ratio=1.0):
    """Return the bounding box of ``points``.

    The ``x_min``/``y_min``/``x_max``/``y_max`` entries stay in PDF points;
    ``width_mm``/``height_mm`` are converted to mm and divided by
    ``scale_ratio``.
    """
    x_min, y_min, x_max, y_max = Polygon(points).bounds
    width_mm = (x_max - x_min) * POINT_TO_MM / scale_ratio
    height_mm = (y_max - y_min) * POINT_TO_MM / scale_ratio
    return {
        "x_min": x_min,
        "y_min": y_min,
        "x_max": x_max,
        "y_max": y_max,
        "width_mm": width_mm,
        "height_mm": height_mm,
    }


def extract_points_from_drawing(drawing):
    """Flatten one drawing dict into ``(points, source_type)``.

    Handles line (``"l"``), rectangle (``"re"``) and cubic curve (``"c"``)
    items; any other command is ignored. A rectangle item short-circuits:
    its outline is returned immediately with ``source_type == "rectangle"``
    and every other item of the drawing is discarded. Otherwise
    ``source_type`` is ``"path"``.
    """
    points = []
    source_type = "path"

    for item in drawing.get("items", []):
        command = item[0]

        if command == "l":
            p1 = point_to_tuple(item[1])
            p2 = point_to_tuple(item[2])
            # Only add the segment start when it does not (almost) coincide
            # with the previous point, to avoid duplicated vertices.
            if not points:
                points.append(p1)
            elif distance(points[-1], p1) > 0.01:
                points.append(p1)
            points.append(p2)

        elif command == "re":
            source_type = "rectangle"
            return rect_to_polygon(item[1]), source_type

        elif command == "c":
            # PyMuPDF cubic item is normally:
            # ("c", start_point, control_1, control_2, end_point)
            if len(item) >= 5:
                p0 = point_to_tuple(item[1])
                p1 = point_to_tuple(item[2])
                p2 = point_to_tuple(item[3])
                p3 = point_to_tuple(item[4])
                if not points:
                    points.append(p0)
                elif distance(points[-1], p0) > 0.01:
                    points.append(p0)
                points.extend(cubic_bezier_points(p0, p1, p2, p3, steps=32))

    return points, source_type


def is_closed(points, tolerance_points=1.5):
    """Return True if the path has >= 4 points and ends where it starts.

    "Ends where it starts" means the first and last points are within
    ``tolerance_points`` (PDF points) of each other.
    """
    if len(points) < 4:
        return False
    return distance(points[0], points[-1]) <= tolerance_points


def is_simple_rectangle(points, source_type):
    """Return True for rectangle items and for paths of at most 5 points."""
    if source_type == "rectangle":
        return True
    # Most CAD frames, dimension boxes and table lines become 5-point
    # rectangles.
    return len(points) <= 5


def reject_reason(points, page_rect, source_type, area_mm2, scale_ratio=1.0):
    """Return a short rejection code for a candidate, or ``None`` to accept.

    Args:
        points: Flattened outline in PDF points.
        page_rect: Page rectangle exposing ``width``/``height`` in PDF points.
        source_type: ``"rectangle"`` or ``"path"``.
        area_mm2: Scale-corrected area in mm^2, or ``None`` if not computed.
        scale_ratio: Drawing scale used for ``area_mm2`` and the bounds.

    Checks run in order and the first failing one determines the code.
    """
    if len(points) < 6:
        return "too_few_points"
    if not is_closed(points):
        return "not_closed"
    if is_simple_rectangle(points, source_type):
        return "rectangle_or_box"
    if area_mm2 is None or area_mm2 <= 0:
        return "zero_area"

    bounds = get_bounds_mm(points, scale_ratio)
    width_mm = bounds["width_mm"]
    height_mm = bounds["height_mm"]
    if width_mm <= 0 or height_mm <= 0:
        return "invalid_bounds"

    # Reject thin long rectangles/lines: this is exactly what was happening
    # on Zodiac, where a long frame/table line was selected as area.
    min_side = min(width_mm, height_mm)
    max_side = max(width_mm, height_mm)
    if min_side < 1.0:
        return "thin_line_or_stroke"
    if max_side / min_side > 80:
        return "extreme_aspect_ratio"

    # Reject page frames / title blocks. area_mm2 has already been divided by
    # scale_ratio ** 2, so the page area must be scaled the same way for the
    # "fraction of the page" comparison to be meaningful at any scale.
    page_area_mm2 = (
        (page_rect.width * POINT_TO_MM) * (page_rect.height * POINT_TO_MM)
    ) / (scale_ratio ** 2)
    if area_mm2 > page_area_mm2 * 0.05:
        return "too_large_page_element"

    # Reject text glyphs / arrows / tiny symbols.
    if area_mm2 < 20:
        return "too_small_detail"

    # Reasonable technical-section limits for this first version.
    # We can later make these user-configurable.
    if width_mm > 250 or height_mm > 250:
        return "too_large_for_profile"

    return None


def candidate_score(candidate):
    """Return a plausibility score for an accepted candidate.

    Higher score = more plausible rubber/profile section. This does not
    guarantee correctness, but avoids obvious false positives. ``candidate``
    must carry numeric ``area_mm2``, ``width_mm`` and ``height_mm``.
    """
    area = candidate["area_mm2"]
    width = candidate["width_mm"]
    height = candidate["height_mm"]

    min_side = min(width, height)
    max_side = max(width, height)
    aspect = max_side / min_side if min_side > 0 else 9999

    score = 0

    # Prefer meaningful areas (thresholds are cumulative).
    if area >= 50:
        score += 20
    if area >= 100:
        score += 20
    if area >= 300:
        score += 10

    # Penalize strange aspect ratios.
    if aspect <= 10:
        score += 20
    elif aspect <= 25:
        score += 5
    else:
        score -= 20

    # Penalize very large bounding boxes.
    if width > 120 or height > 120:
        score -= 10

    return score


def _build_candidate(index, drawing, source_type, points, area_mm2, bounds_data):
    """Assemble the report dict for one drawing.

    ``area_mm2`` and ``bounds_data`` are ``None`` for open or degenerate
    paths; the corresponding report fields are then ``None`` instead of
    raising on ``round(None)`` / ``None[...]``.
    """
    has_area = area_mm2 is not None
    has_bounds = bounds_data is not None
    return {
        "drawing_index": index,
        "source_type": source_type,
        "drawing_type": drawing.get("type"),
        "points_count": len(points),
        "area_mm2": round(area_mm2, 6) if has_area else None,
        "area_cm2": round(area_mm2 / 100.0, 6) if has_area else None,
        "area_m2": round(area_mm2 / 1_000_000.0, 9) if has_area else None,
        "width_mm": round(bounds_data["width_mm"], 3) if has_bounds else None,
        "height_mm": round(bounds_data["height_mm"], 3) if has_bounds else None,
        "bounds_points": (
            {
                "x_min": bounds_data["x_min"],
                "y_min": bounds_data["y_min"],
                "x_max": bounds_data["x_max"],
                "y_max": bounds_data["y_max"],
            }
            if has_bounds
            else None
        ),
        "fill": drawing.get("fill"),
        "color": drawing.get("color"),
    }


def _analyze_first_page(doc, filename, scale_ratio):
    """Run candidate detection on page 0 of an open document."""
    if len(doc) == 0:
        return {
            "success": False,
            "message": "PDF has no pages"
        }

    page = doc[0]
    drawings = page.get_drawings()

    diagnostics = {
        "filename": filename,
        "pages": len(doc),
        "page_width_points": float(page.rect.width),
        "page_height_points": float(page.rect.height),
        "drawings_count": len(drawings),
        "scale_ratio_used": scale_ratio,
        "raw_closed_candidates_count": 0,
        "accepted_candidates_count": 0,
        "rejected_candidates_count": 0,
    }

    if not drawings:
        return {
            "success": False,
            "message": "No vector drawings found. This PDF may be raster/scanned.",
            "confidence": "low",
            "diagnostics": diagnostics
        }

    accepted_candidates = []
    rejected_candidates = []
    # Counted separately from the capped preview list so the diagnostic
    # reflects how many candidates were really rejected.
    rejected_total = 0

    for index, drawing in enumerate(drawings):
        points, source_type = extract_points_from_drawing(drawing)
        if len(points) < 4:
            continue

        area_mm2 = None
        bounds_data = None
        if is_closed(points):
            diagnostics["raw_closed_candidates_count"] += 1
            area_mm2 = polygon_area_mm2(points, scale_ratio=scale_ratio)
            if area_mm2 is not None and area_mm2 > 0:
                bounds_data = get_bounds_mm(points, scale_ratio=scale_ratio)

        reason = reject_reason(
            points=points,
            page_rect=page.rect,
            source_type=source_type,
            area_mm2=area_mm2,
            scale_ratio=scale_ratio
        )

        candidate = _build_candidate(
            index, drawing, source_type, points, area_mm2, bounds_data
        )

        if reason is None:
            # reject_reason only accepts closed paths with a positive area,
            # so area and bounds are guaranteed numeric here.
            candidate["score"] = candidate_score(candidate)
            accepted_candidates.append(candidate)
        else:
            candidate["rejected_reason"] = reason
            rejected_total += 1
            # Keep only useful rejected diagnostics, not thousands of tiny
            # glyphs.
            if len(rejected_candidates) < _MAX_REJECTED_KEPT:
                rejected_candidates.append(candidate)

    diagnostics["accepted_candidates_count"] = len(accepted_candidates)
    diagnostics["rejected_candidates_count"] = rejected_total

    accepted_candidates.sort(key=lambda item: item["score"], reverse=True)

    if not accepted_candidates:
        return {
            "success": False,
            "message": (
                "No reliable closed profile found. "
                "False positives such as rectangles, frames, dimension lines and text were rejected. "
                "This PDF probably needs stitched-contour reconstruction."
            ),
            "confidence": "low",
            "diagnostics": diagnostics,
            "rejected_candidates_preview": rejected_candidates[:30]
        }

    best = accepted_candidates[0]
    area_mm2 = best["area_mm2"]

    # In this MVP, even accepted candidates need validation.
    # We do not want to present a wrong number as final production data.
    confidence = "needs_validation"

    return {
        "success": True,
        "message": (
            "Candidate found after rejecting rectangles, frames and thin lines. "
            "Validate the selected candidate before using it as final area."
        ),
        "area_mm2": round(area_mm2, 6),
        "area_cm2": round(area_mm2 / 100.0, 6),
        "area_m2": round(area_mm2 / 1_000_000.0, 9),
        "scale_detected": f"{scale_ratio}:1 manual/default",
        "confidence": confidence,
        "selected_candidate": best,
        "diagnostics": diagnostics,
        "accepted_candidates_preview": accepted_candidates[:20],
        "rejected_candidates_preview": rejected_candidates[:30]
    }


def calculate_pdf_vector_area(pdf_bytes, filename="uploaded.pdf", scale_ratio=1.0):
    """Find the most plausible closed profile on page 0 and report its area.

    Args:
        pdf_bytes: Raw PDF content, passed to ``fitz.open`` as a stream.
        filename: Name echoed back in the diagnostics only.
        scale_ratio: Drawing scale; lengths are divided by it and areas by
            its square.

    Returns:
        A dict with ``success`` and ``message``. On success it also carries
        the selected candidate, its area in mm^2 / cm^2 / m^2, diagnostics
        and previews of accepted and rejected candidates. On failure it
        carries whatever diagnostics were available.
    """
    # The context manager closes the document on every exit path, including
    # the early "no pages" / "no drawings" returns and any exception.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        return _analyze_first_page(doc, filename, scale_ratio)