<?php
/**
 * Simple WooCommerce Category Scraper → JSON API
 * Inputs (GET):
 *   - category_url (string) e.g. https://hapetteam.com/product-category/food-dog/
 *   - pages (int) e.g. 2  (number of pages to crawl, starting from page 1)
 *
 * Output (application/json; charset=utf-8):
 * {
 *   "category_url": "...",
 *   "pages_requested": 2,
 *   "pages_crawled": 2,
 *   "products_count": 12,
 *   "products": [
 *      {"title": "...", "url": "...", "page": 1, "position": 1},
 *      ...
 *   ],
 *   "errors": [...]
 * }
 */

header('Content-Type: application/json; charset=utf-8');

// Query parameters may arrive as arrays (e.g. ?category_url[]=x). Only plain
// strings / scalars are meaningful here, so anything else falls back to the
// default instead of reaching trim() and raising a TypeError.
$rawCategoryUrl = $_GET['category_url'] ?? '';
$rawPages       = $_GET['pages'] ?? 1;

$categoryUrl = is_string($rawCategoryUrl) ? trim($rawCategoryUrl) : '';
$pages       = is_scalar($rawPages) ? (int) $rawPages : 1;

// FILTER_VALIDATE_URL also accepts schemes such as file:// or ftp://. This
// endpoint fetches the URL server-side, so only web schemes are allowed.
$categoryScheme = strtolower((string) parse_url($categoryUrl, PHP_URL_SCHEME));

if (
    $categoryUrl === ''
    || !filter_var($categoryUrl, FILTER_VALIDATE_URL)
    || !in_array($categoryScheme, ['http', 'https'], true)
) {
    http_response_code(400);
    echo json_encode(["error" => "Invalid or missing 'category_url'"]);
    exit;
}

// Zero / negative / non-numeric page counts crawl just the first page.
if ($pages < 1) $pages = 1;

/** Ensure trailing slash so "page/N/" can be appended to the category URL */
if (substr($categoryUrl, -1) !== '/') $categoryUrl .= '/';

/**
 * Fetch a URL with cURL (GET, following up to 5 redirects).
 *
 * Only HTTP and HTTPS are permitted, both for the initial request and for
 * redirects, so a caller-supplied URL cannot make cURL read local files or
 * speak other protocols.
 *
 * @param string $url Absolute URL to request.
 * @return array Three-element list [body, code, err]:
 *               - body: response body string, or false when the transfer failed
 *               - code: HTTP status code as int (0 when no response was received)
 *               - err:  cURL error message, '' when the transfer succeeded
 */
function http_get($url) {
    $ch = curl_init($url);
    if ($ch === false) {
        // Keep the same tuple shape so callers can treat this like any other fetch failure.
        return [false, 0, 'curl_init failed'];
    }

    $webOnly = CURLPROTO_HTTP | CURLPROTO_HTTPS;

    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER  => true,
        CURLOPT_FOLLOWLOCATION  => true,
        CURLOPT_MAXREDIRS       => 5,
        CURLOPT_TIMEOUT         => 20,
        CURLOPT_CONNECTTIMEOUT  => 10,
        CURLOPT_SSL_VERIFYHOST  => 2,
        CURLOPT_SSL_VERIFYPEER  => true,
        // Restrict the initial request and every redirect hop to web protocols.
        CURLOPT_PROTOCOLS       => $webOnly,
        CURLOPT_REDIR_PROTOCOLS => $webOnly,
        CURLOPT_USERAGENT       => 'Mozilla/5.0 (compatible; WooCategoryScraper/1.0; +https://example.com)',
        CURLOPT_HTTPHEADER      => [
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language: fa-IR,fa;q=0.9,en-US;q=0.8,en;q=0.7',
        ],
    ]);

    $body = curl_exec($ch);
    $err  = curl_error($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    return [$body, $code, $err];
}

/**
 * Resolve a (possibly relative) href against a base URL.
 *
 * Handles absolute URLs (returned unchanged), protocol-relative URLs
 * ("//host/path"), root-relative and document-relative paths, and
 * query-only / fragment-only references. "." and ".." segments and
 * duplicate slashes are normalised in the path only; the query string and
 * fragment are carried through byte-for-byte.
 *
 * @param string $href Link target as found in the document.
 * @param string $base URL of the document the link was found in.
 * @return string Absolute URL; '' when $href is empty; $href itself when
 *                $base cannot be parsed.
 */
function resolve_url($href, $base) {
    // Attribute values may carry surrounding whitespace; '0' is a valid relative path.
    $href = trim((string) $href);
    if ($href === '') return '';

    // Already absolute (carries its own scheme): nothing to resolve.
    if (parse_url($href, PHP_URL_SCHEME)) return $href;

    // Protocol-relative: borrow only the scheme from the base.
    if (strpos($href, '//') === 0) {
        $scheme = parse_url($base, PHP_URL_SCHEME) ?: 'https';
        return $scheme . ':' . $href;
    }

    $p = parse_url($base);
    if (!$p) return $href;

    $scheme   = $p['scheme'] ?? 'https';
    $host     = $p['host'] ?? '';
    $port     = isset($p['port']) ? ':' . $p['port'] : '';
    $basePath = $p['path'] ?? '/';

    // Peel off fragment and query first so path normalisation cannot touch them
    // (a query such as "?next=a//b" must survive unchanged).
    $fragment = '';
    $hashPos  = strpos($href, '#');
    if ($hashPos !== false) {
        $fragment = substr($href, $hashPos);
        $href     = substr($href, 0, $hashPos);
    }
    $query = '';
    $qPos  = strpos($href, '?');
    if ($qPos !== false) {
        $query = substr($href, $qPos);
        $href  = substr($href, 0, $qPos);
    }

    if ($href === '') {
        // Query-only or fragment-only reference: keep the base path, and keep
        // the base query unless the reference supplies its own.
        $path = $basePath;
        if ($query === '' && isset($p['query'])) {
            $query = '?' . $p['query'];
        }
    } elseif ($href[0] === '/') {
        // Root-relative reference replaces the whole base path.
        $path = $href;
    } else {
        // Document-relative: append to the directory part of the base path.
        $path = preg_replace('#/[^/]*$#', '/', $basePath) . $href;
    }

    // A path ending in "/", "/." or "/.." denotes a directory and keeps its trailing slash.
    $isDirectory = (bool) preg_match('#/\.{0,2}$#', $path);

    // Segment-wise normalisation: drop empty and "." segments, let ".." pop
    // the previous segment (and be discarded at the root).
    $segments = [];
    foreach (explode('/', $path) as $segment) {
        if ($segment === '' || $segment === '.') continue;
        if ($segment === '..') {
            array_pop($segments);
            continue;
        }
        $segments[] = $segment;
    }

    $path = '/' . implode('/', $segments);
    if ($isDirectory && $segments !== []) $path .= '/';

    return $scheme . '://' . $host . $port . $path . $query . $fragment;
}

/**
 * Parse one category page and extract product title + link pairs.
 *
 * Several XPath queries are tried from most to least specific; the first
 * query that yields any product wins. If none match, a last-resort scan of
 * anchors that look like product links is used. Within a page each product
 * URL is reported once, at the position of its first occurrence.
 *
 * @param string $html    Raw HTML of the page.
 * @param string $pageUrl URL the HTML was fetched from (base for relative links).
 * @param int    $pageNo  Page number, copied into each item's "page" field.
 * @param array  $errors  Error list, appended to (by reference) when parsing fails.
 * @return array List of ["title" => string, "url" => string, "page" => int, "position" => int].
 */
function extract_products($html, $pageUrl, $pageNo, &$errors) {
    $list = [];

    // Collect libxml complaints internally rather than as PHP warnings, then
    // restore the caller's setting and empty the buffer on every path.
    $previousLibxmlSetting = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    // The XML prolog hints the parser to treat the markup as UTF-8.
    $loaded = $dom->loadHTML('<?xml encoding="utf-8" ?>' . $html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);

    if (!$loaded) {
        $errors[] = "DOM parse failed for $pageUrl";
        return $list;
    }

    $xp = new DOMXPath($dom);

    // Preferred (Woodmart/Elementor): <h3 class="wd-entities-title"><a ...>Title</a></h3>
    $xpaths = [
        "//h3[contains(@class,'wd-entities-title')]/a",
        // Generic WooCommerce (many themes)
        "//ul[contains(@class,'products')]//li[contains(@class,'product')]//a[contains(@class,'woocommerce-LoopProduct-link')]",
        // Fallback: product tiles with title anchors anywhere inside
        "//*[contains(@class,'product')][contains(@class,'type-product')]//a[contains(@href,'/product/')][normalize-space(string())!='']",
    ];

    // absolute URL => index into $list, for per-page de-duplication
    $seen = [];

    foreach ($xpaths as $query) {
        foreach ($xp->query($query) as $a) {
            /** @var DOMElement $a */
            $title = trim(preg_replace('/\s+/u', ' ', $a->textContent));
            $href  = trim($a->getAttribute('href'));

            if ($title === '') {
                // sometimes title is only in aria-label
                $aria = $a->getAttribute('aria-label');
                if ($aria) $title = trim($aria);
            }
            if ($href === '') continue;

            $absHref = resolve_url($href, $pageUrl);

            if (isset($seen[$absHref])) {
                // Same product linked again: keep the first position, but fill
                // in a title if the first anchor had none.
                $idx = $seen[$absHref];
                if ($title !== '' && $list[$idx]['title'] === '') {
                    $list[$idx]['title'] = $title;
                }
                continue;
            }

            $seen[$absHref] = count($list);
            $list[] = [
                "title"    => $title,
                "url"      => $absHref,
                "page"     => $pageNo,
                "position" => count($list) + 1 // 1-based order within page (after de-dupe)
            ];
        }
        // if we already found items, we can skip trying weaker fallbacks
        if (!empty($list)) break;
    }

    // If still nothing found, try anchors that look like product cards (very defensive)
    if (empty($list)) {
        $placeholder = "(بدون عنوان قابل‌خواندن)";
        $indexByUrl  = [];

        foreach ($xp->query("//a[contains(@class,'product-image-link') or contains(@href,'/product/')]") as $a) {
            $title = trim(preg_replace('/\s+/u', ' ', $a->textContent));
            if ($title === '') {
                $title = trim($a->getAttribute('title') ?: $a->getAttribute('aria-label'));
            }
            $href = trim($a->getAttribute('href'));
            if (!$href) continue;
            $absHref = resolve_url($href, $pageUrl);

            if (isset($indexByUrl[$absHref])) {
                // Product cards usually link twice (image + title). Report the
                // product once and let a real title replace the placeholder.
                $idx = $indexByUrl[$absHref];
                if ($title !== '' && $list[$idx]['title'] === $placeholder) {
                    $list[$idx]['title'] = $title;
                }
                continue;
            }

            $indexByUrl[$absHref] = count($list);
            $list[] = [
                "title"    => $title ?: $placeholder,
                "url"      => $absHref,
                "page"     => $pageNo,
                "position" => count($list) + 1
            ];
        }
    }

    return $list;
}

/** Crawl pages 1..$pages of the category and collect unique products */
$all = [];
$errors = [];
$dedupe = [];
$pagesCrawled = 0; // pages that came back with HTTP 200 and a non-empty body

for ($i = 1; $i <= $pages; $i++) {
    // Page 1 is the category URL itself; later pages use the "/page/N/" suffix.
    $url = ($i === 1) ? $categoryUrl : rtrim($categoryUrl, '/') . "/page/$i/";
    list($html, $code, $err) = http_get($url);

    if ($err || $code !== 200 || !$html) {
        // Name the actual cause: transport error, non-200 status, or empty body.
        if ($err) {
            $reason = $err;
        } elseif ($code !== 200) {
            $reason = 'unexpected status';
        } else {
            $reason = 'empty body';
        }
        $errors[] = "Fetch error for $url (HTTP $code): " . $reason;
        continue;
    }

    $pagesCrawled++;
    $items = extract_products($html, $url, $i, $errors);

    foreach ($items as $p) {
        // de-duplicate by URL across pages
        if (!isset($dedupe[$p['url']])) {
            $dedupe[$p['url']] = true;
            $all[] = $p;
        }
    }
}

/** JSON response */
// JSON_PARTIAL_OUTPUT_ON_ERROR: if some string cannot be encoded (e.g. invalid
// UTF-8 in the caller-supplied URL), emit null for it instead of letting
// json_encode() return false and the response body come out empty.
echo json_encode([
    "category_url"   => $categoryUrl,
    "pages_requested"=> $pages,
    "pages_crawled"  => $pagesCrawled, // successful fetches only; see errors for failures
    "products_count" => count($all),
    "products"       => array_values($all),
    "errors"         => $errors
], JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_PRETTY_PRINT | JSON_PARTIAL_OUTPUT_ON_ERROR);