<?php
/**
 * Category Products Web Service (PHP 7.x) with Unicode URL support
 * Features:
 * - Fix nested/proxy style url params
 * - Accept Unicode URLs (Persian paths) and auto-encode them
 * - Concurrent fetch via curl_multi
 * - Extract product names/links via XPath (WooCommerce-friendly)
 * - Outputs: JSON (default), HTML (table), CSV
 * - CORS enabled; works with GET/POST
 */

// Strict scalar typing for calls made from this file. PHP requires this
// directive to remain the first statement of the file.
declare(strict_types=1);
// Default encoding used by the mb_* string functions.
mb_internal_encoding('UTF-8');

// -------- CORS --------
// Emit the CORS response headers on every request, whatever the method.
foreach ([
    'Access-Control-Allow-Origin: *',
    'Access-Control-Allow-Methods: GET, POST, OPTIONS',
    'Access-Control-Allow-Headers: Content-Type, Accept, Origin, User-Agent',
] as $corsHeaderLine) {
    header($corsHeaderLine);
}
// A pre-flight request only needs the headers above: answer 204 with no body and stop.
if ('OPTIONS' === $_SERVER['REQUEST_METHOD']) {
    http_response_code(204);
    exit;
}

// -------- polyfills for PHP 7 --------
/**
 * Tell whether $haystack begins with $needle (byte-wise comparison).
 * An empty needle always matches.
 */
function starts_with($haystack, $needle): bool {
    if ($needle === '') {
        return true;
    }
    $prefixLength = strlen($needle);
    return 0 === strncmp($haystack, $needle, $prefixLength);
}
/**
 * Tell whether $needle occurs anywhere in $haystack (byte-wise).
 * Unlike PHP 8's str_contains(), an empty needle never matches.
 */
function contains($haystack, $needle): bool {
    if ($needle === '') {
        return false;
    }
    $position = strpos($haystack, $needle);
    return $position !== false;
}
/**
 * Return $arr[$key] when that entry is set and not null, otherwise $default.
 */
function array_get($arr, $key, $default = null) {
    // `??` applies the same isset() test as an explicit ternary would.
    return $arr[$key] ?? $default;
}

// -------- Unicode URL helpers --------
/**
 * Convert an internationalised host name to its ASCII form via idn_to_ascii().
 *
 * The host is returned untouched when it is empty/null, when idn_to_ascii()
 * is not available, or when the conversion fails.
 */
function idn_ascii_or_same($host) {
    $convertible = $host !== '' && $host !== null && function_exists('idn_to_ascii');
    if (!$convertible) {
        return $host;
    }
    $ascii = idn_to_ascii($host, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46);
    return $ascii === false ? $host : $ascii;
}
/**
 * Rebuild a URL so that it contains ASCII characters only.
 *
 * - the host goes through idn_ascii_or_same();
 * - a path segment without any percent-escape is rawurlencode()d as a whole;
 * - a path segment that already contains escapes keeps them, and only its
 *   remaining raw bytes (non-ASCII, spaces, control bytes) are escaped;
 * - raw bytes in the query string and the fragment are escaped the same way,
 *   leaving delimiters such as '&' and '=' untouched.
 *
 * User/password components reported by parse_url() are not re-emitted.
 *
 * @param string $u URL, possibly containing raw UTF-8
 * @return string the rebuilt URL, or $u unchanged when parse_url() rejects it
 */
function normalize_unicode_url_to_ascii($u) {
    $parts = @parse_url($u);
    if ($parts === false) return $u;

    // Escape each byte outside printable ASCII. The pattern has no /u flag on
    // purpose: it works byte by byte, so multi-byte UTF-8 sequences become
    // consecutive %XX escapes and invalid UTF-8 cannot make the regex fail.
    $escapeRawBytes = static function ($text) {
        $escaped = preg_replace_callback(
            '/[^\x21-\x7E]/',
            static function ($m) { return rawurlencode($m[0]); },
            $text
        );
        return $escaped === null ? $text : $escaped;
    };

    $scheme = isset($parts['scheme']) ? $parts['scheme'] : '';
    $host   = isset($parts['host']) ? $parts['host'] : '';
    $port   = isset($parts['port']) ? ':' . $parts['port'] : '';
    $path   = isset($parts['path']) ? $parts['path'] : '';
    $query  = isset($parts['query']) ? $escapeRawBytes($parts['query']) : '';
    $frag   = isset($parts['fragment']) ? '#' . $escapeRawBytes($parts['fragment']) : '';

    if ($host !== '') $host = idn_ascii_or_same($host);

    if ($path !== '') {
        $segments = explode('/', $path);
        foreach ($segments as $i => $seg) {
            if ($seg === '') continue;
            if (rawurldecode($seg) === $seg) {
                // No escapes present: encode the whole segment.
                $segments[$i] = rawurlencode($seg);
            } else {
                // Partly escaped already: keep the escapes, fix only the raw bytes.
                $segments[$i] = $escapeRawBytes($seg);
            }
        }
        $path = implode('/', $segments);
    }

    $rebuilt = '';
    if ($scheme !== '') $rebuilt .= $scheme . '://';
    if ($host   !== '') $rebuilt .= $host;
    $rebuilt .= $port;
    $rebuilt .= $path;
    if ($query  !== '') $rebuilt .= '?' . $query;
    $rebuilt .= $frag;
    return $rebuilt;
}

// -------- helpers: output --------
/**
 * Send $data as a pretty-printed JSON document and terminate the script.
 *
 * Strings that are not valid UTF-8 would make json_encode() return false and
 * leave the client with an empty body, so encoding is attempted in three steps:
 * invalid sequences are substituted where the PHP version supports it, then
 * unencodable values are dropped to null, and as a last resort a fixed
 * 500 error document is sent.
 *
 * @param array $data   payload to encode
 * @param int   $status HTTP status code for the response
 */
function respondJson(array $data, int $status = 200): void {
    $flags = JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_PRETTY_PRINT;
    // Available from PHP 7.2; looked up by name so older 7.x versions still run.
    if (defined('JSON_INVALID_UTF8_SUBSTITUTE')) {
        $flags |= constant('JSON_INVALID_UTF8_SUBSTITUTE');
    }

    $json = json_encode($data, $flags);
    if ($json === false) {
        $json = json_encode($data, $flags | JSON_PARTIAL_OUTPUT_ON_ERROR);
    }
    if ($json === false) {
        $status = 500;
        $json = '{"error":"response could not be encoded as JSON"}';
    }

    http_response_code($status);
    header('Content-Type: application/json; charset=utf-8');
    echo $json;
    exit;
}
/**
 * Render the product list as a right-to-left HTML table and terminate the script.
 *
 * @param array $products rows with the keys 'name', 'url' and 'page'
 * @param array $meta     'category_url', 'pages_crawled' and 'pages_requested'
 */
function respondHtml(array $products, array $meta): void {
    // Explicit flags: the htmlspecialchars() defaults differ between PHP versions,
    // and without ENT_SUBSTITUTE invalid UTF-8 turns the whole string into ''.
    $esc = static function ($value): string {
        return htmlspecialchars((string)$value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    };
    // Only http(s) targets become clickable; any other scheme (e.g. javascript:)
    // is replaced by an inert '#', while the visible text still shows the value.
    $safeHref = static function ($url): string {
        $url = (string)$url;
        return preg_match('#^https?://#i', $url) ? $url : '#';
    };
    header('Content-Type: text/html; charset=utf-8');
    ?>
    <!doctype html><html lang="fa" dir="rtl"><head>
      <meta charset="utf-8"><meta name="viewport" content="width=device-width,initial-scale=1">
      <title>خروجی محصولات</title>
      <style>
        body{font-family:Tahoma,Arial;margin:24px}
        table{border-collapse:collapse;width:100%}
        th,td{border:1px solid #ddd;padding:8px;text-align:right;vertical-align:top}
        th{background:#f4f4f4}
        .meta{margin-bottom:16px;color:#555}
        a{word-break:break-all}
      </style>
    </head><body>
      <h1>فهرست محصولات</h1>
      <div class="meta">
        <div><b>دسته:</b> <a href="<?= $esc($safeHref($meta['category_url'])) ?>" target="_blank" rel="noopener"><?= $esc($meta['category_url']) ?></a></div>
        <div><b>صفحات واکشی‌شده:</b> <?= (int)$meta['pages_crawled'] ?> از <?= (int)$meta['pages_requested'] ?></div>
        <div><b>تعداد محصول:</b> <?= count($products) ?></div>
      </div>
      <table><thead><tr><th>#</th><th>نام</th><th>لینک</th><th>صفحه</th></tr></thead><tbody>
      <?php $i=1; foreach($products as $p): ?>
        <tr>
          <td><?= $i++ ?></td>
          <?php /* Compare against '' so that a product literally named "0" is not shown as untitled. */ ?>
          <td><?= $esc((string)$p['name'] !== '' ? $p['name'] : '(بدون‌عنوان)') ?></td>
          <td><a href="<?= $esc($safeHref($p['url'])) ?>" target="_blank" rel="noopener"><?= $esc($p['url']) ?></a></td>
          <td><?= (int)$p['page'] ?></td>
        </tr>
      <?php endforeach; ?>
      </tbody></table>
    </body></html>
    <?php
    exit;
}
/**
 * Stream the product list as a CSV download (UTF-8 with BOM) and terminate the script.
 *
 * @param array $products rows with the keys 'name', 'url' and 'page'
 */
function respondCsv(array $products): void {
    foreach ([
        'Content-Type: text/csv; charset=utf-8',
        'Content-Disposition: attachment; filename="products.csv"',
    ] as $headerLine) {
        header($headerLine);
    }

    $stream = fopen('php://output', 'w');
    // UTF-8 byte-order mark, written ahead of the header row.
    fwrite($stream, "\xEF\xBB\xBF");

    $rows = [['name', 'url', 'page']];
    foreach ($products as $product) {
        $rows[] = [$product['name'], $product['url'], $product['page']];
    }
    foreach ($rows as $row) {
        fputcsv($stream, $row);
    }

    fclose($stream);
    exit;
}

// -------- misc helpers --------
/**
 * Replace non-breaking spaces with plain spaces, collapse every whitespace run
 * into a single space and trim the result.
 *
 * When the input is not valid UTF-8 the /u regex yields null, so the result
 * is an empty string.
 */
function normalizeWhitespace($s): string {
    $collapsed = preg_replace('/\s+/u', ' ', str_replace("\xC2\xA0", ' ', $s));
    return trim((string)$collapsed);
}
/**
 * Unwrap a "proxy style" value whose own query string carries the real target,
 * e.g. "exo.php?url=https://host/cat&pages=3".
 *
 * @param string $raw value of the outer url parameter
 * @return array ['url' => string, 'pages' => int|null]; when no usable inner
 *               url is found, 'url' is the whitespace-normalised input and
 *               'pages' is null
 */
function tryExtractInnerParams($raw): array {
    // normalizeWhitespace() already trims, so no further ltrim() is needed.
    $raw = normalizeWhitespace($raw);
    $passthrough = ['url' => $raw, 'pages' => null];

    if (strpos($raw, '?') === false) return $passthrough;

    // parse_url() gives null for an absent/empty query and false for a malformed
    // URL; handing either to parse_str() under strict_types is a TypeError.
    $query = parse_url($raw, PHP_URL_QUERY);
    if (!is_string($query) || $query === '') return $passthrough;

    parse_str($query, $inner);
    $innerUrl = array_get($inner, 'url', null);
    // Array-valued input (url[]=...) and empty values are not usable targets.
    if (!is_string($innerUrl) || in_array($innerUrl, ['', '0'], true)) return $passthrough;

    $innerPages = array_get($inner, 'pages', null);
    return [
        'url'   => normalizeWhitespace($innerUrl),
        'pages' => is_scalar($innerPages) ? (int)$innerPages : null,
    ];
}
/**
 * Prefix "https://" when $u does not already start with "<scheme>://".
 * Leading slashes of a scheme-less value are removed first.
 */
function ensureScheme($u): string {
    $hasScheme = preg_match('#^[a-z][a-z0-9+\-.]*://#i', $u) === 1;
    return $hasScheme ? $u : 'https://' . ltrim($u, '/');
}
/**
 * Build the URL of page $page of a listing by setting its "page" query parameter.
 *
 * Page 1 (or lower) carries no page parameter. All other query parameters are
 * kept exactly as written: the query string is edited textually rather than
 * through parse_str()/http_build_query(), which would rewrite keys containing
 * dots or spaces, collapse repeated keys and re-encode values.
 *
 * @param string $baseUrl listing URL, with or without an existing query string
 * @param int    $page    1-based page number
 * @return string URL with scheme (default https), host, port, path (default "/"),
 *                query and fragment
 */
function buildPageUrl(string $baseUrl, int $page): string {
    $parts = parse_url($baseUrl);
    if (!is_array($parts)) $parts = [];

    $pairs = [];
    if (isset($parts['query']) && $parts['query'] !== '') {
        foreach (explode('&', $parts['query']) as $pair) {
            if ($pair === '') continue;
            $rawKey = explode('=', $pair, 2)[0];
            // Drop any existing page parameter; it is re-added below when needed.
            if (urldecode($rawKey) === 'page') continue;
            $pairs[] = $pair;
        }
    }
    if ($page > 1) $pairs[] = 'page=' . $page;

    $scheme   = isset($parts['scheme']) ? $parts['scheme'] : 'https';
    $host     = isset($parts['host']) ? $parts['host'] : '';
    $port     = isset($parts['port']) ? ':' . $parts['port'] : '';
    $path     = isset($parts['path']) ? $parts['path'] : '/';
    $newQuery = implode('&', $pairs);
    $frag     = isset($parts['fragment']) ? '#' . $parts['fragment'] : '';
    return $scheme . '://' . $host . $port . $path . ($newQuery !== '' ? '?' . $newQuery : '') . $frag;
}
/**
 * Resolve $href against the absolute URL $base.
 *
 * - an empty href yields $base;
 * - "//host/..." inherits the scheme of $base (https when it has none);
 * - "<scheme>://..." is returned unchanged;
 * - everything else is treated as a path reference: "." and ".." segments are
 *   resolved on the path part only, a trailing slash is kept, and the
 *   reference's own query/fragment is appended untouched. A reference that is
 *   only "?query" or "#fragment" keeps the path of $base.
 *
 * Scheme-only references such as "mailto:..." are not recognised and are
 * resolved like relative paths.
 *
 * @param string $base absolute URL of the page the link was found on
 * @param string $href href value as found in the document
 * @return string absolute URL
 */
function absolutize(string $base, string $href): string {
    if ($href === '') return $base;

    if (strncmp($href, '//', 2) === 0) {
        $scheme = parse_url($base, PHP_URL_SCHEME);
        return ($scheme ?: 'https') . ':' . $href;
    }
    if (preg_match('#^[a-z][a-z0-9+\-.]*://#i', $href)) return $href;

    $baseParts = parse_url($base);
    if (!is_array($baseParts)) $baseParts = [];
    $scheme   = isset($baseParts['scheme']) ? $baseParts['scheme'] : 'https';
    $host     = isset($baseParts['host']) ? $baseParts['host'] : '';
    $port     = isset($baseParts['port']) ? ':' . $baseParts['port'] : '';
    $basePath = isset($baseParts['path']) ? $baseParts['path'] : '/';

    // Split the reference into its path and its "?query#fragment" tail so that
    // dot-segment handling never touches the query or the fragment.
    $cut      = strcspn($href, '?#');
    $hrefPath = (string)substr($href, 0, $cut);
    $suffix   = (string)substr($href, $cut);

    if ($hrefPath === '') {
        // "?x=1" or "#top": same document path as the base.
        $path = $basePath;
        if ($suffix[0] === '#' && isset($baseParts['query'])) {
            $suffix = '?' . $baseParts['query'] . $suffix;
        }
    } elseif ($hrefPath[0] === '/') {
        $path = $hrefPath;
    } else {
        // Relative to the directory of the base path.
        $path = preg_replace('#/[^/]*$#', '/', $basePath) . $hrefPath;
    }

    $rawSegments = explode('/', $path);
    // A path ending in "/", "/." or "/.." denotes a directory: keep the slash.
    $endsAsDirectory = in_array(end($rawSegments), ['', '.', '..'], true);

    $segments = [];
    foreach ($rawSegments as $seg) {
        if ($seg === '' || $seg === '.') continue;
        if ($seg === '..') { array_pop($segments); continue; }
        $segments[] = $seg;
    }

    $normalized = '/' . implode('/', $segments);
    if ($segments && $endsAsDirectory) $normalized .= '/';

    return $scheme . '://' . $host . $port . $normalized . $suffix;
}
/**
 * Turn a piece of extracted text into a single clean line: decode HTML
 * entities, replace non-breaking spaces, collapse whitespace runs and trim.
 *
 * Trimming happens after collapsing so that leading/trailing non-ASCII
 * whitespace (for example a decoded &nbsp;) does not leave a stray space.
 * Invalid UTF-8 makes the /u regex return null; in that case whitespace is
 * collapsed byte-wise instead of violating the string return type.
 *
 * @param string $t raw text
 * @return string cleaned text
 */
function cleanText(string $t): string {
    $t = html_entity_decode($t, ENT_QUOTES | ENT_HTML5, 'UTF-8');
    $t = str_replace("\xC2\xA0", ' ', $t);

    $collapsed = preg_replace('/\s+/u', ' ', $t);
    if ($collapsed === null) {
        $collapsed = (string)preg_replace('/\s+/', ' ', $t);
    }
    return trim($collapsed);
}

// -------- product extraction --------
/**
 * Extract product links from one listing page.
 *
 * Candidate <a> elements are collected with three XPath queries (hint match,
 * WooCommerce-style list items, generic title/product classes). A candidate is
 * kept when it points to the same host as $baseUrl and its path contains
 * $productPathHint or one of /product/, /products/, /item/, /shop/.
 *
 * URLs are de-duplicated case-insensitively. When the same URL is linked more
 * than once (e.g. an image link followed by a title link), the first
 * non-empty name found is used.
 *
 * @param string $html            page markup
 * @param string $baseUrl         URL the markup was fetched from
 * @param string $productPathHint path fragment that marks a product URL
 * @param int    $pageNo          page number stored with every row
 * @return array list of ['name' => string, 'url' => string, 'page' => int]
 */
function extractProducts(string $html, string $baseUrl, string $productPathHint, int $pageNo): array {
    $doc = new DOMDocument();
    libxml_use_internal_errors(true);
    // The XML declaration makes loadHTML() read the markup as UTF-8.
    if (!starts_with($html, '<?xml')) $html = '<?xml encoding="utf-8" ?>' . $html;
    @$doc->loadHTML($html);
    libxml_clear_errors();
    $xp = new DOMXPath($doc);

    // XPath 1.0 has no escape syntax inside string literals: choose a quote
    // character the hint does not contain, or assemble the value with concat().
    if (strpos($productPathHint, "'") === false) {
        $hintLiteral = "'" . $productPathHint . "'";
    } elseif (strpos($productPathHint, '"') === false) {
        $hintLiteral = '"' . $productPathHint . '"';
    } else {
        $hintLiteral = "concat('" . str_replace("'", "', \"'\", '", $productPathHint) . "')";
    }

    $queries = [
        "//a[contains(@href, $hintLiteral)]",
        "//li[contains(@class,'product')]//a[contains(@class,'product') or contains(@class,'woocommerce-LoopProduct-link') or contains(@href,'/product/')]",
        "//a[contains(translate(@class,'TITLE','title'),'title') or contains(translate(@class,'PRODUCT','product'),'product')]",
    ];

    $hostBase   = parse_url($baseUrl, PHP_URL_HOST);
    $indexByUrl = []; // lower-cased URL => position in $out
    $out        = [];

    foreach ($queries as $q) {
        $nodes = $xp->query($q);
        if (!$nodes) continue;
        foreach ($nodes as $a) {
            /** @var DOMElement $a */
            $href = trim($a->getAttribute('href'));
            if ($href === '') continue;

            $url = absolutize($baseUrl, $href);

            // Host names are case-insensitive; skip links that leave the site.
            $hostUrl = parse_url($url, PHP_URL_HOST);
            if ($hostUrl && $hostBase && strcasecmp($hostUrl, $hostBase) !== 0) continue;

            $pathUrl = parse_url($url, PHP_URL_PATH);
            $pathUrl = $pathUrl ? $pathUrl : '';
            $looksProduct = contains($pathUrl, $productPathHint)
                || preg_match('#/(product|products|item|shop)/#i', $pathUrl);
            if (!$looksProduct) continue;

            // textContent already includes the text of every descendant, so when
            // it is empty no nested heading/span can supply a name either; the
            // only other source is the link's title attribute.
            $name = cleanText((string)$a->textContent);
            if ($name === '' && $a->hasAttribute('title')) {
                $name = cleanText($a->getAttribute('title'));
            }

            $key = strtolower($url);
            if (isset($indexByUrl[$key])) {
                // Same product seen before: only fill in a name that was missing.
                $idx = $indexByUrl[$key];
                if ($out[$idx]['name'] === '' && $name !== '') {
                    $out[$idx]['name'] = $name;
                }
                continue;
            }
            $indexByUrl[$key] = count($out);
            $out[] = ['name' => $name, 'url' => $url, 'page' => $pageNo];
        }
    }
    return $out;
}

// -------- input parsing --------
$input = $_SERVER['REQUEST_METHOD'] === 'POST' ? $_POST : $_GET;

// Read one request parameter as a string. Array-valued input (e.g. url[]=x)
// falls back to the default instead of reaching string functions, which would
// throw a TypeError under strict_types.
$readParam = static function (string $key, string $default) use ($input): string {
    $value = array_get($input, $key, $default);
    return is_scalar($value) ? (string)$value : $default;
};

$rawUrl  = normalizeWhitespace(rawurldecode($readParam('url', '')));
$format  = strtolower($readParam('format', 'json'));
$pagesIn = array_get($input, 'pages', null);
if (!is_scalar($pagesIn)) $pagesIn = null;
$productPathHint = $readParam('productPathHint', '/product/');

// cURL treats a timeout of 0 as "no limit", so keep the value within 1..120 seconds.
$timeout = max(1, min((int)$readParam('timeout', '20'), 120));

// The user agent ends up in a request header: strip control characters
// (including CR/LF) and fall back to the default when nothing is left.
$defaultUa = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125 Safari/537.36 (CategoryScraper/1.1)';
$ua = trim((string)preg_replace('/[\x00-\x1F\x7F]+/', ' ', $readParam('ua', $defaultUa)));
if ($ua === '') $ua = $defaultUa;

$inner = tryExtractInnerParams($rawUrl);
$catUrl = normalizeWhitespace(rawurldecode($inner['url']));
// ensureScheme() leaves values that already carry a scheme untouched.
$catUrl = ensureScheme($catUrl);
$catUrlAscii = normalize_unicode_url_to_ascii($catUrl);

$pages = (int)($pagesIn !== null ? $pagesIn : ($inner['pages'] !== null ? $inner['pages'] : 1));
if ($pages < 1) $pages = 1;

// Only http(s) targets are fetched: other schemes (file://, javascript://, ...)
// can pass FILTER_VALIDATE_URL but must never reach cURL or the HTML output.
// NOTE(review): the host itself is not restricted, so internal addresses can
// still be requested — confirm whether that is acceptable for this deployment.
$catScheme = strtolower((string)parse_url($catUrlAscii, PHP_URL_SCHEME));
$urlIsUsable = filter_var($catUrlAscii, FILTER_VALIDATE_URL) !== false
    && in_array($catScheme, ['http', 'https'], true);

if (!$urlIsUsable) {
    respondJson([
        'error' => 'پارامتر url الزامی و باید یک URL معتبر باشد.',
        'hint'  => 'نمونه صحیح: exo.php?url=' . rawurlencode('https://exo.ir/category/کارت-گرافیک') . '&pages=3',
        'received_url' => $rawUrl
    ], 400);
}
// Values below 1 were raised to 1 above, so only the upper bound can fail here.
if ($pages > 100) {
    respondJson(['error' => 'پارامتر pages باید بین 1 تا 100 باشد.'], 400);
}

// -------- fetch concurrently --------
$mh = curl_multi_init();
$handles = [];
$responses = [];
$errors = [];

// Requests and the redirects they follow are limited to HTTP(S).
$allowedProtocols = CURLPROTO_HTTP | CURLPROTO_HTTPS;

for ($i = 1; $i <= $pages; $i++) {
    $pageUrl = buildPageUrl($catUrlAscii, $i);
    $ch = curl_init($pageUrl);
    if ($ch === false) {
        $errors[] = [
            'page' => $i,
            'url'  => $pageUrl,
            'http_status' => 0,
            'error' => 'curl_init failed',
        ];
        continue;
    }
    $opts = [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS      => 5,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT        => $timeout,
        CURLOPT_ENCODING       => '',
        CURLOPT_HTTPHEADER     => [
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language: fa-IR,fa;q=0.9,en;q=0.8',
            'Cache-Control: no-cache',
        ],
        CURLOPT_USERAGENT      => $ua,
        CURLOPT_SSL_VERIFYPEER => true,
        CURLOPT_SSL_VERIFYHOST => 2,
        CURLOPT_PROTOCOLS       => $allowedProtocols,
        CURLOPT_REDIR_PROTOCOLS => $allowedProtocols,
    ];
    curl_setopt_array($ch, $opts);
    curl_multi_add_handle($mh, $ch);
    // Keyed by page number: a cURL handle is an object on PHP 8 and cannot be
    // cast to a string to serve as an array key.
    $handles[$i] = ['handle' => $ch, 'page' => $i, 'url' => $pageUrl];
}

$running = 0;
do {
    $status = curl_multi_exec($mh, $running);
    // curl_multi_select() returns -1 when it cannot wait on the handles;
    // pause briefly so the loop does not spin at full speed.
    if ($running && curl_multi_select($mh, 1.0) === -1) {
        usleep(10000);
    }
} while ($running && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));

foreach ($handles as $h) {
    $ch = $h['handle'];
    $body = curl_multi_getcontent($ch);
    $err  = curl_error($ch);
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_multi_remove_handle($mh, $ch);
    curl_close($ch);
    // A transport error, an HTTP status >= 400 or an empty body all count as a failed page.
    if ($err || $code >= 400 || $body === false || trim((string)$body) === '') {
        $errors[] = [
            'page' => $h['page'],
            'url'  => $h['url'],
            'http_status' => $code,
            'error' => $err ?: 'empty body or HTTP error',
        ];
        continue;
    }
    $responses[$h['page']] = $body;
}
curl_multi_close($mh);

// -------- parse --------
// Walk the fetched pages in ascending page order and merge their products,
// keeping the first occurrence of every URL (compared case-insensitively).
ksort($responses);
$allProducts = [];
$seenUrl = [];
foreach ($responses as $pageNo => $html) {
    $pageNo = (int)$pageNo;
    $pageProducts = extractProducts($html, buildPageUrl($catUrlAscii, $pageNo), $productPathHint, $pageNo);
    foreach ($pageProducts as $product) {
        $dedupeKey = strtolower($product['url']);
        if (!isset($seenUrl[$dedupeKey])) {
            $seenUrl[$dedupeKey] = true;
            $allProducts[] = $product;
        }
    }
}

// -------- result & output --------
$pagesCrawled = count($responses);

$result = [
    'input' => [
        'category_url_raw'  => $catUrl,       // input as received (may contain Unicode)
        'category_url'      => $catUrlAscii,  // ASCII form the requests were made with
        'pages_requested'   => $pages,
        'product_path_hint' => $productPathHint,
        'format'            => $format
    ],
    'pages_crawled' => $pagesCrawled,
    'products_count'=> count($allProducts),
    'products'      => $allProducts,
    'errors'        => $errors,
    'ts'            => date('c'),
];

// Each responder terminates the script itself; JSON is the default format.
switch ($format) {
    case 'html':
        respondHtml($allProducts, [
            'category_url'    => $catUrl,
            'pages_crawled'   => $pagesCrawled,
            'pages_requested' => $pages,
        ]);
        break;
    case 'csv':
        respondCsv($allProducts);
        break;
    default:
        respondJson($result);
}