<?php
/**
 * Simple WooCommerce Category Scraper
 * Input:  category_url, pages, [format=json|html]
 * Output: list of products: name + url
 *
 * Usage examples:
 *  - HTML:  /category-scraper.php?category_url=https://korians.com/product-category/clothing/woman-dress/&pages=2
 *  - JSON:  /category-scraper.php?category_url=https://korians.com/product-category/clothing/woman-dress/&pages=2&format=json
 *
 * Notes:
 *  - Please respect the site's robots.txt and terms.
 *  - Adjust SELECTORS below if theme markup differs.
 */

// Use UTF-8 as the default encoding for mb_* calls that do not pass one explicitly.
mb_internal_encoding('UTF-8');

/**
 * Send a JSON response and terminate the script.
 *
 * The payload is encoded before any header is sent so that an encoding
 * failure (for example malformed UTF-8 inside $data) can be reported as a
 * 500 with a small JSON error body instead of an empty response carrying
 * the caller's success status.
 *
 * @param mixed $data   Value to serialise.
 * @param int   $status HTTP status code to send (default 200).
 */
function respond_json($data, $status = 200) {
    $flags = JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_PRETTY_PRINT;

    $payload = json_encode($data, $flags);
    if ($payload === false) {
        // json_encode() signals failure by returning false; never echo that silently.
        $status = 500;
        $payload = json_encode(
            ['error' => 'JSON encoding failed: ' . json_last_error_msg()],
            $flags
        );
    }

    http_response_code($status);
    header('Content-Type: application/json; charset=utf-8');
    echo $payload;
    exit;
}

/**
 * Escape a value for output inside HTML text or a quoted attribute.
 *
 * Null is rendered as an empty string. Both quote styles are escaped and
 * invalid byte sequences are substituted rather than emptying the result.
 *
 * @param mixed $s Value to escape.
 * @return string HTML-safe text.
 */
function h($s) {
    $text  = $s ?? '';
    $flags = ENT_QUOTES | ENT_SUBSTITUTE;

    return htmlspecialchars($text, $flags, 'UTF-8');
}

/**
 * Normalise a user-supplied category URL before it is used as a crawl base.
 *
 * - strips all whitespace (leading, trailing and embedded),
 * - drops the "#fragment" part, which is never sent to the server and would
 *   otherwise end up in front of the appended slash or pagination segment,
 * - appends a trailing slash when the URL has no query string, matching
 *   pretty-permalink style.
 *
 * @param string $url Raw URL.
 * @return string Normalised URL.
 */
function normalize_url($url) {
    $url = preg_replace('/\s+/', '', trim((string) $url));

    // Remove the fragment, if any.
    $hash = strpos($url, '#');
    if ($hash !== false) {
        $url = substr($url, 0, $hash);
    }

    // Ensure trailing slash for pretty permalinks (URLs with a query are left alone).
    if (strpos($url, '?') === false && substr($url, -1) !== '/') {
        $url .= '/';
    }
    return $url;
}

/**
 * Build the pretty-permalink URL of page $page of a paginated archive.
 *
 * Page 1 (or lower) is the base URL itself. For later pages "/page/{n}/" is
 * appended to the path part of the base. Any "?query" or "#fragment" suffix
 * is kept after the pagination segment, so
 * "https://site/shop/?orderby=price" becomes
 * "https://site/shop/page/2/?orderby=price" rather than having the segment
 * glued onto the query value.
 *
 * @param string $base Archive URL.
 * @param int    $page 1-based page number.
 * @return string URL of the requested page.
 */
function build_page_url($base, $page) {
    $page = intval($page);
    if ($page <= 1) return $base;

    // Split the base at the first "?" or "#": pagination belongs to the path.
    $cut    = strcspn($base, '?#');
    $path   = substr($base, 0, $cut);
    $suffix = (string) substr($base, $cut);

    return rtrim($path, '/') . '/page/' . $page . '/' . $suffix;
}

/**
 * Fetch a URL with cURL and return body, status and headers.
 *
 * Only http and https are allowed, both for the initial request and for any
 * redirect that is followed, because the URL originates from user input and
 * cURL would otherwise also serve schemes such as file:// or gopher://.
 *
 * @param string $url URL to fetch.
 * @return array [body|null, HTTP status (0 on transport failure),
 *                headers of the final response keyed by lower-cased name,
 *                error message|null]
 */
function fetch_url($url) {
    $ch = curl_init();
    if ($ch === false) {
        return [null, 0, [], 'Unable to initialise cURL'];
    }

    $web_only = CURLPROTO_HTTP | CURLPROTO_HTTPS;
    curl_setopt_array($ch, [
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS => 5,
        CURLOPT_PROTOCOLS => $web_only,
        CURLOPT_REDIR_PROTOCOLS => $web_only,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT => 20,
        CURLOPT_ENCODING => '', // accept gzip/deflate
        CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible; CategoryScraper/1.0; +https://example.com/bot)',
        CURLOPT_SSL_VERIFYPEER => true,
        CURLOPT_SSL_VERIFYHOST => 2,
        CURLOPT_HEADER => true,
    ]);

    $resp = curl_exec($ch);
    if ($resp === false) {
        $err = curl_error($ch);
        curl_close($ch);
        return [null, 0, [], $err];
    }

    // With CURLOPT_HEADER the response starts with the header block(s);
    // CURLINFO_HEADER_SIZE is their combined length, redirects included.
    $header_size = (int) curl_getinfo($ch, CURLINFO_HEADER_SIZE);
    $status      = (int) curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
    $headers_raw = (string) substr($resp, 0, $header_size);
    $body        = (string) substr($resp, $header_size);
    curl_close($ch);

    // Every followed redirect contributes its own header block. Only the
    // last block describes the body that is returned, so parse that one
    // instead of mixing in e.g. the Location header of an earlier hop.
    $blocks = preg_split('/\r?\n\r?\n/', trim($headers_raw));
    $final_block = $blocks ? (string) end($blocks) : '';

    $headers = [];
    foreach (preg_split('/\r?\n/', $final_block) as $line) {
        if (strpos($line, ':') !== false) {
            [$k, $v] = array_map('trim', explode(':', $line, 2));
            $headers[strtolower($k)] = $v;
        }
    }
    return [$body, $status, $headers, null];
}

/**
 * Resolve an href found on a page into an absolute http(s) URL.
 *
 * Handles absolute, protocol-relative, root-relative, path-relative,
 * query-only ("?a=b") and fragment-only ("#x") references. Dot segments are
 * resolved in the path only; the "?query#fragment" suffix is carried over
 * untouched.
 *
 * @param string $href Reference as found in the document.
 * @param string $base URL of the document the reference was found in.
 * @return string Absolute URL. An empty string is returned for hrefs that use
 *                a non-web scheme (mailto:, tel:, javascript:, data:, ...),
 *                since they cannot name a product page. $href is returned
 *                unchanged when $base cannot be parsed.
 */
function absolute_url($href, $base) {
    $href = trim((string) $href);

    // if already absolute
    if (preg_match('~^https?://~i', $href)) return $href;
    // any other explicit scheme is not a web page; do not glue it onto the site
    if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href)) return '';
    // protocol-relative
    if (strpos($href, '//') === 0) {
        $scheme = parse_url($base, PHP_URL_SCHEME) ?: 'https';
        return $scheme . ':' . $href;
    }

    $parts = parse_url($base);
    if (!$parts) return $href;
    $scheme = $parts['scheme'] ?? 'https';
    $host   = $parts['host'] ?? '';
    $port   = isset($parts['port']) ? ':' . $parts['port'] : '';
    $origin = $scheme . '://' . $host . $port;

    $base_path = $parts['path'] ?? '';
    if ($base_path === '' || $base_path[0] !== '/') {
        $base_path = '/' . $base_path;
    }

    // Separate the path of the reference from its "?query#fragment" suffix so
    // that slashes and dots inside a query value are never treated as path.
    $cut      = strcspn($href, '?#');
    $ref_path = substr($href, 0, $cut);
    $suffix   = (string) substr($href, $cut);

    if ($ref_path === '') {
        // Empty, query-only or fragment-only reference: the base path stays.
        // The base query is kept unless the reference brings its own.
        if (($suffix === '' || $suffix[0] === '#') && isset($parts['query'])) {
            $suffix = '?' . $parts['query'] . $suffix;
        }
        return $origin . $base_path . $suffix;
    }

    if ($ref_path[0] === '/') {
        $path = $ref_path;
    } else {
        // relative to the directory of the base document
        $path = preg_replace('~/[^/]*$~', '/', $base_path) . $ref_path;
    }

    // resolve ./ and ../ (empty segments are collapsed as well)
    $segments = [];
    foreach (explode('/', $path) as $seg) {
        if ($seg === '' || $seg === '.') continue;
        if ($seg === '..') { array_pop($segments); continue; }
        $segments[] = $seg;
    }
    $resolved = '/' . implode('/', $segments);

    // Keep a trailing slash when the reference named a directory ("x/", "." or "..").
    // The root is already "/" and must not become "//".
    if ($resolved !== '/' && preg_match('~/\.{0,2}\z~', $path)) {
        $resolved .= '/';
    }

    return $origin . $resolved . $suffix;
}

/**
 * Extract product names and URLs from the HTML of a WooCommerce archive page.
 *
 * Anchors are located with three strategies, each used only when the
 * previous one found nothing:
 *   1. a.woocommerce-LoopProduct-link
 *   2. any link inside li.product that is not an "add to cart" button
 *   3. the first link of any div whose class contains "product"
 * The title is taken from the loop title heading, the aria-label, the image
 * alt text or the anchor text, in that order.
 *
 * @param string $html     Page markup, expected to be UTF-8.
 * @param string $base_url URL used to resolve relative hrefs.
 * @return array List of ['name' => string, 'url' => string], unique by URL.
 */
function parse_products($html, $base_url) {
    // --- SELECTORS (adjust if theme differs) ---
    // Link selector (WooCommerce loop):
    //   a.woocommerce-LoopProduct-link   OR   li.product a:not(.add_to_cart_button)
    // Title selector fallback:
    //   h2.woocommerce-loop-product__title, attribute aria-label, img[alt]
    $html = (string) $html;
    if (trim($html) === '') {
        // DOMDocument::loadHTML() rejects an empty string (ValueError on PHP 8).
        return [];
    }

    $dom = new DOMDocument();
    $previous_error_mode = libxml_use_internal_errors(true);
    // libxml assumes ISO-8859-1 when the markup declares no charset, so
    // non-ASCII characters are passed as numeric entities. This replaces the
    // 'HTML-ENTITIES' pseudo-encoding, which is deprecated since PHP 8.2.
    $encoded = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
    $loaded = $dom->loadHTML($encoded);
    libxml_clear_errors();
    // hand the process-wide libxml setting back the way it was found
    libxml_use_internal_errors($previous_error_mode);
    if (!$loaded) return [];

    $xpath = new DOMXPath($dom);
    $results = [];
    $title_query = ".//h2[contains(@class,'woocommerce-loop-product__title')]";

    // Collapse whitespace runs (Unicode-aware, so &nbsp; counts) and trim.
    $clean = static function ($text) {
        $text = (string) $text;
        $collapsed = preg_replace('/\s+/u', ' ', $text);
        if ($collapsed === null) {
            // invalid UTF-8: fall back to byte-wise matching
            $collapsed = preg_replace('/\s+/', ' ', $text);
        }
        return trim((string) $collapsed);
    };
    // In-page anchors and non-web schemes can never be product pages.
    $is_navigable = static function ($href) {
        return $href !== ''
            && $href[0] !== '#'
            && !preg_match('~^(javascript|mailto|tel|data):~i', $href);
    };
    $is_web_url = static function ($url) {
        return is_string($url) && preg_match('~^https?://~i', $url) === 1;
    };

    // Prefer explicit WooCommerce loop anchor
    $anchors = $xpath->query("//a[contains(concat(' ', normalize-space(@class), ' '), ' woocommerce-LoopProduct-link ')]");
    if ($anchors->length === 0) {
        // Fallback: any link inside a product card but not "add to cart"
        $anchors = $xpath->query("//li[contains(concat(' ', normalize-space(@class), ' '), ' product ')]//a[not(contains(@class,'add_to_cart'))]");
    }

    foreach ($anchors as $a) {
        /** @var DOMElement $a */
        $href = trim($a->getAttribute('href'));
        if (!$is_navigable($href)) continue;
        $url = absolute_url($href, $base_url);
        if (!$is_web_url($url)) continue;

        // Try to find the closest title: heading inside the anchor first
        $title = '';
        $h2 = $xpath->query($title_query, $a)->item(0);
        if ($h2) $title = $clean($h2->textContent);

        if ($title === '') {
            // sometimes the heading is a sibling inside the anchor's parent
            $parent = $a->parentNode;
            if ($parent instanceof DOMElement) {
                $h2b = $xpath->query($title_query, $parent)->item(0);
                if ($h2b) $title = $clean($h2b->textContent);
            }
        }
        if ($title === '') {
            $title = $clean($a->getAttribute('aria-label'));
        }
        if ($title === '') {
            $img = $xpath->query(".//img", $a)->item(0);
            if ($img) $title = $clean($img->getAttribute('alt'));
        }
        if ($title === '') {
            $title = $clean($a->textContent);
        }

        if ($title !== '') {
            $results[$url] = ['name' => $title, 'url' => $url];
        }
    }

    // If loop anchors failed completely, try another common pattern
    if (empty($results)) {
        $cards = $xpath->query("//div[contains(@class,'product') or contains(@class,'wc-block-grid__product')]");
        foreach ($cards as $card) {
            $a = $xpath->query(".//a[1]", $card)->item(0);
            if (!$a) continue;
            $href = trim($a->getAttribute('href'));
            if (!$is_navigable($href)) continue;
            $url = absolute_url($href, $base_url);
            if (!$is_web_url($url)) continue;

            $title = '';
            $heading = $xpath->query(".//h2|.//h3", $card)->item(0);
            if ($heading) $title = $clean($heading->textContent);
            if ($title === '') $title = $clean($a->textContent);

            if ($title !== '') {
                $results[$url] = ['name' => $title, 'url' => $url];
            }
        }
    }

    return array_values($results); // deduped by URL
}

/**
 * Pause execution for the given number of milliseconds.
 *
 * The argument is converted to an integer; negative values mean no pause.
 *
 * @param int $ms Delay in milliseconds.
 */
function polite_delay_ms($ms) {
    $millis = intval($ms);
    if ($millis < 0) {
        $millis = 0;
    }
    usleep($millis * 1000);
}

// ---------- Controller ----------
$method = $_SERVER['REQUEST_METHOD'] ?? 'GET';

// Read a request parameter from the query string first, then the POST body.
// PHP turns "?name[]=x" into an array; such values (which would make
// strtolower() throw a TypeError on PHP 8) are treated as missing.
// Surrounding whitespace is dropped so a pasted URL still validates.
$read_param = static function ($key, $default) {
    $value = $_GET[$key] ?? $_POST[$key] ?? $default;
    return is_string($value) ? trim($value) : $default;
};

$category_url = $read_param('category_url', '');
$pages = intval($read_param('pages', '1'));
$format = strtolower($read_param('format', 'html'));

// No category URL supplied: show the input form and stop. The form submits
// via GET with the fields category_url, pages and format, which are the
// parameters the controller reads.
if (!$category_url) {
    // Static HTML form (Persian, right-to-left); nothing user-supplied is echoed here.
    header('Content-Type: text/html; charset=utf-8');
    ?>
    <!doctype html>
    <html lang="fa" dir="rtl">
    <head>
        <meta charset="utf-8">
        <title>استخراج محصولات دسته ووکامرس</title>
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <style>
            body { font-family: sans-serif; max-width: 900px; margin: 2rem auto; padding: 0 1rem; }
            label { display:block; margin-top:1rem; }
            input[type=text], input[type=number] { width:100%; padding:0.6rem; }
            button { margin-top:1rem; padding:0.6rem 1rem; cursor:pointer; }
            .hint { color:#666; font-size:0.9rem; }
            table { border-collapse: collapse; width: 100%; margin-top: 1rem; }
            th, td { border: 1px solid #ccc; padding: 0.5rem; text-align: right; }
            th { background: #f5f5f5; }
            footer { margin-top: 2rem; color:#777; font-size:0.85rem; }
        </style>
    </head>
    <body>
        <h1>استخراج نام و لینک محصولات یک دسته</h1>
        <form method="get">
            <label>لینک دسته‌بندی (مثال: https://korians.com/product-category/clothing/woman-dress/)</label>
            <input type="text" name="category_url" required>

            <label>تعداد صفحات برای جستجو</label>
            <input type="number" name="pages" value="1" min="1" max="100" required>

            <label>فرمت خروجی</label>
            <select name="format">
                <option value="html">HTML (جدول)</option>
                <option value="json">JSON</option>
            </select>

            <button type="submit">شروع</button>
            <p class="hint">لطفاً به قوانین سایت مقصد و robots.txt احترام بگذارید. این ابزار برای تست/تحلیل است.</p>
        </form>
        <footer>Version 1.0 · PHP + cURL + DOM</footer>
    </body>
    </html>
    <?php
    exit;
}

// Validate input.
// FILTER_VALIDATE_URL accepts any scheme (javascript://, file://, gopher://, ...).
// The URL is later rendered as a link href and handed to cURL, so only
// http and https are allowed through.
$category_url_ok = is_string($category_url)
    && filter_var($category_url, FILTER_VALIDATE_URL) !== false
    && in_array(strtolower((string) parse_url($category_url, PHP_URL_SCHEME)), ['http', 'https'], true);
if (!$category_url_ok) {
    respond_json(['error' => 'لینک دسته‌بندی معتبر نیست.'], 400);
}
if ($pages < 1 || $pages > 100) {
    respond_json(['error' => 'تعداد صفحات باید بین 1 تا 100 باشد.'], 400);
}

// Crawl the requested pages of the category and collect every product found.
//   $all           - products of all pages (may contain duplicates)
//   $errors        - one message per page that could not be fetched
//   $visited_pages - the URL that was finally used for each page
$base = normalize_url($category_url);
$all = [];
$errors = [];
$visited_pages = [];

for ($p = 1; $p <= $pages; $p++) {
    // Be polite: pause before every page after the first. Waiting here
    // rather than at the end of the iteration also spaces out requests that
    // follow a failed page, and avoids a useless pause after the last page.
    if ($p > 1) {
        polite_delay_ms(500);
    }

    $page_url = build_page_url($base, $p);
    [$body, $status, , $err] = fetch_url($page_url);

    // fallback to ?paged= if /page/ fails
    if (($status >= 400 || !$body) && $p > 1) {
        $sep = (strpos($base, '?') !== false) ? '&' : '?';
        $alt_url = $base . $sep . 'paged=' . $p;
        [$body2, $status2, , $err2] = fetch_url($alt_url);
        if ($status2 >= 200 && $status2 < 400 && $body2) {
            $body = $body2;
            $status = $status2;
            $err = $err2;
            $page_url = $alt_url;
        }
    }
    $visited_pages[] = $page_url;

    if ($err) {
        $errors[] = "Page {$p}: cURL error: {$err}";
        continue;
    }
    if ($status < 200 || $status >= 400 || !$body) {
        $errors[] = "Page {$p}: HTTP status {$status}";
        if ($status === 404 && $p > 1) {
            // Both URL styles report "not found": the archive has no further
            // pages, so do not keep requesting the remaining page numbers.
            break;
        }
        continue;
    }

    // Relative links are relative to the page they were found on.
    $products = parse_products($body, $page_url);
    $all = array_merge($all, $products);
}

// Collapse duplicates across pages: index every product by its URL (a later
// entry replaces an earlier one with the same URL), then renumber the list.
$dedup = array_column($all, null, 'url');
$final = array_values($dedup);

// Emit the result: a JSON document when format=json, otherwise an HTML page.
// Both branches end the script (respond_json() exits, the HTML branch calls exit).
if ($format === 'json') {
    respond_json([
        'category_url' => $base,
        'pages_requested' => $pages,
        'pages_visited' => $visited_pages,
        'count' => count($final),
        'products' => $final,
        'errors' => $errors
    ]);
} else {
    // HTML report: product table, then product count, fetch errors (if any)
    // and the list of page URLs that were requested. Every dynamic value is
    // passed through h() before it is written into the markup.
    header('Content-Type: text/html; charset=utf-8');
    ?>
    <!doctype html>
    <html lang="fa" dir="rtl">
    <head>
        <meta charset="utf-8">
        <title>نتایج استخراج محصولات</title>
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <style>
            body { font-family: sans-serif; max-width: 1100px; margin: 2rem auto; padding: 0 1rem; }
            h1 { margin-bottom: 0.2rem; }
            .sub { color:#666; margin-top:0; }
            table { border-collapse: collapse; width: 100%; margin-top: 1rem; }
            th, td { border: 1px solid #ccc; padding: 0.5rem; text-align: right; }
            th { background: #f5f5f5; }
            .meta { margin-top: 1rem; color:#555; }
            .error { color:#b00; }
            a { word-break: break-all; }
        </style>
    </head>
    <body>
        <h1>نتایج استخراج محصولات</h1>
        <p class="sub">دسته: <a href="<?=h($base)?>" target="_blank" rel="noopener"><?=h($base)?></a> | صفحات درخواست‌شده: <?=h($pages)?></p>

        <table>
            <thead>
                <tr>
                    <th>#</th>
                    <th>نام محصول</th>
                    <th>لینک</th>
                </tr>
            </thead>
            <tbody>
            <?php foreach ($final as $i => $row): ?>
                <tr>
                    <td><?= $i+1 ?></td>
                    <td><?= h($row['name']) ?></td>
                    <td><a href="<?= h($row['url']) ?>" target="_blank" rel="noopener"><?= h($row['url']) ?></a></td>
                </tr>
            <?php endforeach; ?>
            </tbody>
        </table>

        <div class="meta">
            <p>تعداد محصولات: <b><?= count($final) ?></b></p>
            <?php if (!empty($errors)): ?>
                <p class="error">هشدارها:</p>
                <ul>
                    <?php foreach ($errors as $e): ?><li><?= h($e) ?></li><?php endforeach; ?>
                </ul>
            <?php endif; ?>
            <p>صفحات بازدیدشده:</p>
            <ul>
                <?php foreach ($visited_pages as $pv): ?><li><a href="<?= h($pv) ?>" target="_blank" rel="noopener"><?= h($pv) ?></a></li><?php endforeach; ?>
            </ul>
            <p>برای خروجی JSON از پارامتر <code>?format=json</code> استفاده کنید.</p>
        </div>
    </body>
    </html>
    <?php
    exit;
}