<?php
/**
 * Category Products Webservice (Strict)
 * PHP 7+
 * GET:
 *   - url   : category URL (e.g. https://asangsm.com/product-category/.../)
 *   - pages : number of pages to scan (>=1)
 * Output: JSON { ok, source, count, products:[{name,url,page}], errors }
 */

// Strict scalar typing for every call made from this file (must stay the first statement).
declare(strict_types=1);
// Default encoding used by mb_* functions when no encoding argument is passed.
mb_internal_encoding('UTF-8');
// Sent before any output: every response produced by this script is JSON.
header('Content-Type: application/json; charset=utf-8');

// ---------- utils ----------
/**
 * Send a JSON response and terminate the script.
 *
 * The payload is encoded before the status code is set so that an encoding
 * failure can still be reported as a well-formed JSON error instead of an
 * empty body.
 *
 * @param mixed $data Payload to encode as JSON.
 * @param int   $code HTTP status code to send (default 200).
 */
function respond($data, int $code = 200) {
    $flags = JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_PRETTY_PRINT;

    $json = json_encode($data, $flags);
    if ($json === false) {
        // Typically malformed UTF-8 in a string: retry, letting json_encode
        // substitute the un-encodable values rather than failing outright.
        $json = json_encode($data, $flags | JSON_PARTIAL_OUTPUT_ON_ERROR);
    }
    if ($json === false) {
        // Last resort: a fixed, always-valid error document.
        $code = 500;
        $json = '{"ok":false,"error":"JSON encoding failed"}';
    }

    http_response_code($code);
    echo $json;
    exit;
}

/**
 * Normalize a user-supplied category URL into a base URL that ends with "/".
 *
 * Steps:
 *  - trim, then percent-encode every byte outside printable ASCII (so raw
 *    non-ASCII slugs survive FILTER_SANITIZE_URL, which would otherwise
 *    silently delete those bytes and yield a different URL);
 *  - require an http:// or https:// scheme and a host;
 *  - drop the query string and fragment (pagination is appended to the path
 *    as /page/N/, which cannot work after a "?" or "#");
 *  - drop a trailing /page/N segment.
 *
 * @param string $url Raw URL as received from the client.
 * @return string Normalized URL with a trailing slash, or '' when invalid.
 */
function normalize_base_category_url(string $url): string {
    $url = trim($url);

    // Byte-wise match (no /u): existing %XX sequences are untouched because
    // '%' is inside the printable range.
    $url = (string)preg_replace_callback(
        '/[^\x21-\x7E]+/',
        static function (array $m): string { return rawurlencode($m[0]); },
        $url
    );

    $url = filter_var($url, FILTER_SANITIZE_URL);
    if (!is_string($url) || !preg_match('~^https?://~i', $url)) return '';

    // Keep only the part before the first '?' or '#'.
    $url = substr($url, 0, strcspn($url, '?#'));

    // drop trailing /page/N
    $url = preg_replace('~/page/\d+/?$~i', '/', $url);
    if (!is_string($url)) return '';

    // A scheme with no host (e.g. "https://") is not a usable URL.
    if (!is_string(parse_url($url, PHP_URL_HOST))) return '';

    return rtrim($url, '/') . '/';
}

/**
 * Build the URL of the n-th listing page.
 *
 * Page 1 (or any n <= 1) is the base URL itself, returned untouched; later
 * pages are the base URL without trailing slashes followed by "/page/{n}/".
 *
 * @param string $base Base category URL.
 * @param int    $n    Page number.
 * @return string URL to fetch for that page.
 */
function build_page_url(string $base, int $n): string {
    if ($n > 1) {
        $trimmed = rtrim($base, '/');
        return $trimmed . '/page/' . $n . '/';
    }
    return $base;
}

/**
 * Fetch a URL with cURL and return its body.
 *
 * Redirects are followed (max 5) with TLS verification on. Both the initial
 * request and any redirect target are restricted to HTTP/HTTPS, because the
 * URL originates from the client and a redirect must not be able to switch
 * to another protocol handler.
 *
 * @param string $url     URL to fetch.
 * @param int    $timeout Seconds used for both the connect and total timeout.
 * @return array On success: ['ok' => true, 'html' => string, 'code' => int].
 *               On failure (transport error or HTTP status >= 400):
 *               ['ok' => false, 'error' => string, 'code' => int].
 */
function fetch_html(string $url, int $timeout = 20): array {
    $ch = curl_init();
    if ($ch === false) {
        return ['ok' => false, 'error' => 'curl_init failed', 'code' => 0];
    }

    curl_setopt_array($ch, [
        CURLOPT_URL             => $url,
        CURLOPT_RETURNTRANSFER  => true,
        CURLOPT_FOLLOWLOCATION  => true,
        CURLOPT_MAXREDIRS       => 5,
        CURLOPT_PROTOCOLS       => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        CURLOPT_CONNECTTIMEOUT  => $timeout,
        CURLOPT_TIMEOUT         => $timeout,
        CURLOPT_SSL_VERIFYPEER  => true,
        CURLOPT_SSL_VERIFYHOST  => 2,
        CURLOPT_USERAGENT       => 'MajaziCategoryWS/1.1 (+php; crawler)',
        CURLOPT_HTTPHEADER      => ['Accept: text/html,application/xhtml+xml;q=0.9,*/*;q=0.8'],
    ]);

    $body = curl_exec($ch);
    // Read error and status before the handle is closed.
    $err  = curl_error($ch);
    $code = (int)curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    curl_close($ch);

    if ($body === false || $code >= 400) {
        // Prefer the transport error text; fall back to the HTTP status.
        return ['ok' => false, 'error' => $err ?: ('HTTP '.$code), 'code' => $code];
    }
    return ['ok' => true, 'html' => $body, 'code' => $code];
}

/**
 * Resolve an href found in a page against that page's URL.
 *
 * Handles, in order: references that already carry a scheme (returned
 * unchanged, including mailto:/tel:/javascript:), scheme-relative "//host/..."
 * references, root-relative "/path" references, query-only / fragment-only
 * references, and relative paths with "." / ".." segments.
 *
 * @param string $href Reference as written in the document.
 * @param string $base Absolute URL of the document containing the reference.
 * @return string Absolute URL, or $href unchanged when $base has no scheme/host.
 */
function make_absolute_url(string $href, string $base): string {
    // Anything with an explicit scheme is already absolute; treating e.g.
    // "mailto:x" as a relative path would fabricate a bogus http URL.
    if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href)) return $href;

    $bp = parse_url($base);
    if (!$bp || empty($bp['scheme']) || empty($bp['host'])) return $href;
    $scheme = $bp['scheme'];
    $origin = $scheme . '://' . $bp['host'] . (isset($bp['port']) ? ':' . $bp['port'] : '');

    if (strpos($href, '//') === 0) return $scheme . ':' . $href;
    if (substr($href, 0, 1) === '/') return $origin . $href;

    // Split the reference into its path and its "?query#fragment" tail so that
    // dot-segment resolution never touches query or fragment text.
    $cut      = strcspn($href, '?#');
    $relPath  = (string)substr($href, 0, $cut);
    $suffix   = (string)substr($href, $cut);
    $basePath = (isset($bp['path']) && $bp['path'] !== '') ? $bp['path'] : '/';

    if ($relPath === '') {
        // Empty, query-only or fragment-only reference: the base path is kept
        // whole; the base query is kept only when the reference has none.
        $query = '';
        if (isset($bp['query']) && ($suffix === '' || $suffix[0] === '#')) {
            $query = '?' . $bp['query'];
        }
        return $origin . $basePath . $query . $suffix;
    }

    // relative path: replace the last segment of the base path
    $dir      = (string)preg_replace('~/[^/]*$~', '/', $basePath);
    $segments = explode('/', $dir . $relPath);

    // resolve ../ and ./ (empty segments are collapsed)
    $parts = [];
    foreach ($segments as $seg) {
        if ($seg === '' || $seg === '.') continue;
        if ($seg === '..') { array_pop($parts); continue; }
        $parts[] = $seg;
    }
    $resolved = '/' . implode('/', $parts);

    // A reference ending in "/", "." or ".." names a directory; keep the
    // trailing slash, but never produce "//" at the root.
    $last = end($segments);
    if (($last === '' || $last === '.' || $last === '..') && $resolved !== '/') {
        $resolved .= '/';
    }

    return $origin . $resolved . $suffix;
}

/**
 * Accept only the site's own product URLs.
 *
 * A URL qualifies when it is an absolute http/https URL, its host equals
 * $baseHost or is a subdomain of it (case-insensitive), and its path starts
 * with "/product/".
 *
 * @param string $href     Absolute URL to test.
 * @param string $baseHost Host of the category page being crawled.
 * @return bool True when $href is a product URL on the same site.
 */
function is_product_url(string $href, string $baseHost): bool {
    // Without a base host the subdomain test below would compare against "."
    // alone and accept any host that merely ends with a dot.
    if ($baseHost === '') return false;

    $p = parse_url($href);
    if (!$p || empty($p['scheme']) || empty($p['host']) || empty($p['path'])) return false;

    // Product pages are web pages; reject ftp:, file:, etc.
    if (!in_array(strtolower($p['scheme']), ['http', 'https'], true)) return false;

    // Same domain or one of its subdomains.
    $h = strtolower($p['host']);
    $b = strtolower($baseHost);
    $suffix = '.' . $b;
    $same = ($h === $b) || (substr($h, -strlen($suffix)) === $suffix);
    if (!$same) return false;

    // The path must start with /product/.
    return (strpos($p['path'], '/product/') === 0);
}

/**
 * Extract products from a category page's HTML: only anchors inside common
 * product-card markup, and only links that is_product_url() accepts for the
 * host of $baseUrl.
 *
 * Each distinct URL (compared case-insensitively) is returned once, with the
 * name taken from the first matching anchor that yields a non-empty name.
 *
 * @param string $html    Page markup (treated as UTF-8).
 * @param string $baseUrl URL the page was fetched for; used to resolve relative
 *                        hrefs and to determine the site's host.
 * @return array List of ['name' => string, 'url' => string].
 */
function extract_products_from_html(string $html, string $baseUrl): array {
    // Nothing to parse; DOMDocument::loadHTML('') throws ValueError on PHP 8.
    if (trim($html) === '') return [];

    $prevErrors = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    // Turn every non-ASCII character into a numeric entity so libxml decodes
    // the text correctly regardless of the page's (possibly missing) charset
    // declaration. Replaces mb_convert_encoding(..., 'HTML-ENTITIES'), which is
    // deprecated since PHP 8.2.
    $html = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
    $loaded = $doc->loadHTML($html);
    // Real-world markup produces many parse errors; discard them and restore
    // the caller's libxml error mode.
    libxml_clear_errors();
    libxml_use_internal_errors($prevErrors);
    if (!$loaded) return [];

    $xp = new DOMXPath($doc);

    $baseHost = parse_url($baseUrl, PHP_URL_HOST) ?: '';
    $results = [];
    $seen = [];

    // Common WooCommerce / theme selectors, most specific first.
    $queries = [
        "//a[contains(@class,'woocommerce-LoopProduct-link') and @href]",
        "//div[contains(@class,'product-item')]//h3/a[@href]",
        "//li[contains(@class,'product')]//a[@href]",
        "//div[contains(@class,'product')]//a[@href]"
    ];

    foreach ($queries as $q) {
        $nodes = $xp->query($q);
        if ($nodes === false) continue;

        /** @var DOMElement $a */
        foreach ($nodes as $a) {
            $href = trim($a->getAttribute('href'));
            if ($href === '') continue;
            $href = html_entity_decode($href, ENT_QUOTES, 'UTF-8');
            $hrefAbs = make_absolute_url($href, $baseUrl);
            // Drops social links and anything that is not a /product/ URL on this site.
            if (!is_product_url($hrefAbs, $baseHost)) continue;

            // Product name: the anchor's own text first.
            $name = trim($a->textContent);
            if ($name === '') {
                // Otherwise look for the first h3 (then h2) under the nearest
                // ancestor that contains one.
                $h = $a->parentNode;
                while ($h && $h->nodeType === XML_ELEMENT_NODE) {
                    $try = null;
                    $hs = $h->getElementsByTagName('h3');
                    if ($hs->length > 0) $try = trim($hs->item(0)->textContent);
                    if (!$try) {
                        $hs2 = $h->getElementsByTagName('h2');
                        if ($hs2->length > 0) $try = trim($hs2->item(0)->textContent);
                    }
                    if ($try) { $name = $try; break; }
                    $h = $h->parentNode;
                }
            }
            // Collapse every whitespace run (including a lone newline or tab)
            // into a single space.
            $name = trim(preg_replace('/\s+/u', ' ', $name) ?? '');

            // Skip nameless entries and noise rows such as the price-filter heading.
            if ($name === '' || preg_match('/^فیلتر\s+براساس\s+قیمت\s*:$/u', $name)) continue;

            $key = strtolower($hrefAbs);
            if (!isset($seen[$key])) {
                $results[] = ['name' => $name, 'url' => $hrefAbs];
                $seen[$key] = true;
            }
        }
    }

    return $results;
}

// ---------- inputs ----------
// $_GET values are strings, or arrays for "?url[]=..." style parameters.
// Casting an array to string raises a warning (which can corrupt the JSON
// body) and casting it to int silently yields 1, so arrays are rejected here.
$rawUrl   = $_GET['url'] ?? '';
$rawPages = $_GET['pages'] ?? 0;
$baseUrl  = is_string($rawUrl) ? $rawUrl : '';
$pages    = is_scalar($rawPages) ? (int)$rawPages : 0;

if ($baseUrl === '' || $pages <= 0) {
    respond([
        'ok' => false,
        'error' => 'پارامترهای ورودی نامعتبرند. استفاده: ?url=https://asangsm.com/product-category/.../&pages=2'
    ], 400);
}

$baseUrl = normalize_base_category_url($baseUrl);
if ($baseUrl === '') {
    respond(['ok'=>false,'error'=>'URL نامعتبر است. باید با http/https شروع شود.'], 400);
}

// NOTE(review): `pages` has no upper bound and `url` may point at any host,
// so a single request can trigger an arbitrary number of outbound fetches to
// an arbitrary target. Consider a page cap and a host allow-list.

// ---------- crawl ----------
$products = [];   // de-duplicated result list, in discovery order
$seenUrls = [];   // lower-cased URL => true
$errs     = [];
for ($page = 1; $page <= $pages; $page++) {
    $pageUrl = build_page_url($baseUrl, $page);
    $res = fetch_html($pageUrl, 25);
    if (!$res['ok']) {
        // A failed page is recorded and skipped; the crawl continues.
        $errs[] = ['page'=>$page, 'url'=>$pageUrl, 'error'=>$res['error'] ?? ('HTTP '.$res['code'])];
        continue;
    }

    foreach (extract_products_from_html($res['html'], $baseUrl) as $item) {
        // dedup by URL across pages: the first page a product appears on wins
        $key = strtolower($item['url']);
        if (isset($seenUrls[$key])) continue;
        $seenUrls[$key] = true;
        $products[] = ['name'=>$item['name'], 'url'=>$item['url'], 'page'=>$page];
    }
}

// ---------- response ----------
respond([
    'ok' => true,
    'source' => [
        'base_url' => $baseUrl,
        'pages'    => $pages,
        'note'     => 'Paging format assumed as /page/{n}/; page 1 uses base URL.'
    ],
    'count'    => count($products),
    'products' => $products,
    'errors'   => $errs,
]);
