<?php
/**
 * WooCommerce category scraper (name + link) - PHP 7 compatible
 * GET:  scrape_woocommerce.php?url=https://example.com/product-category/guitar/&pages=3
 * POST JSON: {"url":"https://example.com/product-category/guitar/","pages":3}
 */

// Strict scalar typing for calls made from this file. PHP requires this
// declaration to be the first statement of the script, so keep it on top.
declare(strict_types=1);
// Announce a JSON body up front; respond() below emits JSON-encoded output.
header('Content-Type: application/json; charset=utf-8');

// ---------- DEBUG (optional) ----------
// ?debug=1 on the query string switches on verbose PHP error output for this
// request. The comparison is deliberately loose (==), so numeric-looking
// values such as "01" enable it as well.
$debug = false;
if (isset($_GET['debug'])) {
    $debug = ($_GET['debug'] == '1');
}
if ($debug) {
    error_reporting(E_ALL);
    ini_set('display_startup_errors', '1');
    ini_set('display_errors', '1');
}

// ---------- helpers ----------
/**
 * Send a JSON response and terminate the script.
 *
 * The body is `['ok' => $ok]` merged with $payload (payload keys win on
 * collision, as with array_merge).
 *
 * If the data cannot be JSON-encoded, json_encode() returns false; echoing
 * that would produce an empty body. Invalid UTF-8 is therefore substituted
 * where the running PHP supports it (7.2+), and any remaining failure is
 * reported as a fixed JSON error body with status 500.
 *
 * @param bool  $ok      Success flag placed under the "ok" key.
 * @param array $payload Additional top-level keys for the response.
 * @param int   $code    HTTP status code to send.
 * @return void This function never returns; it calls exit.
 */
function respond($ok, $payload = [], $code = 200) {
    $flags = JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES;
    if (defined('JSON_INVALID_UTF8_SUBSTITUTE')) {
        // Only evaluated when the constant exists, so PHP 7.0/7.1 stay supported.
        $flags |= JSON_INVALID_UTF8_SUBSTITUTE;
    }

    $body = json_encode(array_merge(['ok' => $ok], $payload), $flags);
    if ($body === false) {
        $code = 500;
        $body = json_encode(['ok' => false, 'error' => 'JSON encoding failed: ' . json_last_error_msg()]);
        if ($body === false) {
            $body = '{"ok":false,"error":"JSON encoding failed"}';
        }
    }

    http_response_code($code);
    echo $body;
    exit;
}

/**
 * Read the request parameters.
 *
 * A non-empty request body that decodes to a JSON array/object is returned
 * as-is. Otherwise the "url" and "pages" query-string parameters are
 * returned, each null when absent.
 *
 * @return array Decoded JSON body, or ['url' => ..., 'pages' => ...].
 */
function get_input(): array {
    $body = file_get_contents('php://input');
    if (!empty($body)) {
        $decoded = json_decode($body, true);
        $decodedCleanly = (json_last_error() === JSON_ERROR_NONE);
        if ($decodedCleanly && is_array($decoded)) {
            return $decoded;
        }
    }

    // Fall back to the query string.
    $params = [];
    foreach (['url', 'pages'] as $key) {
        $params[$key] = isset($_GET[$key]) ? $_GET[$key] : null;
    }
    return $params;
}

// Polyfills for PHP7
/**
 * Tell whether $haystack begins with $needle (byte-wise, case-sensitive).
 * An empty needle always matches.
 *
 * @param string $haystack String to inspect.
 * @param string $needle   Prefix to look for.
 * @return bool
 */
function starts_with($haystack, $needle) {
    if ($needle === '') {
        return true;
    }
    return strncmp($haystack, $needle, strlen($needle)) === 0;
}
/**
 * Tell whether $haystack ends with $needle (byte-wise, case-sensitive).
 * An empty needle always matches; a needle longer than the haystack never does.
 *
 * @param string $haystack String to inspect.
 * @param string $needle   Suffix to look for.
 * @return bool
 */
function ends_with($haystack, $needle) {
    $needleLength = strlen($needle);
    if ($needleLength === 0) {
        return true;
    }
    if ($needleLength > strlen($haystack)) {
        return false;
    }
    return substr($haystack, -$needleLength) === $needle;
}

/**
 * Normalise a category URL: trim it, default the scheme to https when no
 * http(s) scheme is present, and make sure the *path* ends with a slash.
 *
 * A query string or fragment is left untouched; the trailing slash is added
 * to the path in front of it (appending it to the very end would alter the
 * last query value, e.g. "?orderby=price" -> "?orderby=price/").
 *
 * @param string $url Caller-supplied URL, with or without scheme.
 * @return string Normalised URL.
 */
function normalize_base_url($url) {
    $url = trim((string)$url);
    if (!preg_match('#^https?://#i', $url)) $url = 'https://' . ltrim($url, '/');

    // Separate "scheme://host/path" from an optional "?query#fragment" tail.
    $cut  = strcspn($url, '?#');
    $head = (string)substr($url, 0, $cut);
    $tail = (string)substr($url, $cut);

    if (substr($head, -1) !== '/') $head .= '/';
    return $head . $tail;
}
/**
 * Build the URL of page $page of a paginated listing.
 *
 * Page 1 (or lower) is the base URL itself. Later pages get "/page/N/"
 * appended to the path; a query string or fragment on the base URL is kept
 * after the inserted segment rather than in front of it.
 *
 * @param string $baseUrl Listing URL (page 1).
 * @param int    $page    1-based page number.
 * @return string URL for the requested page.
 */
function build_page_url($baseUrl, $page) {
    if ($page <= 1) return $baseUrl;

    // Separate the path part from an optional "?query#fragment" tail.
    $cut  = strcspn($baseUrl, '?#');
    $head = (string)substr($baseUrl, 0, $cut);
    $tail = (string)substr($baseUrl, $cut);

    return rtrim($head, '/') . '/page/' . $page . '/' . $tail;
}

/**
 * Fetch a URL with cURL and return the outcome as an array.
 *
 * Redirects are followed (at most 5). Because the URL originates from the
 * caller, both the initial request and any redirect are restricted to
 * http/https so that other libcurl protocols (file, ftp, gopher, ...) cannot
 * be reached through this function.
 *
 * @param string $url     URL to fetch.
 * @param int    $timeout Seconds, used for both the connect and the total timeout.
 * @return array ['status' => int HTTP status (0 when no response was received),
 *                'error'  => string cURL error text ('' on success),
 *                'html'   => string|null response body, null on failure]
 */
function fetch_url($url, $timeout = 20) {
    if (!function_exists('curl_init')) {
        return ['status'=>0, 'error'=>'cURL extension not available', 'html'=>null];
    }
    $ch = curl_init($url);
    if ($ch === false) {
        return ['status'=>0, 'error'=>'Failed to initialise cURL handle', 'html'=>null];
    }
    $httpOnly = CURLPROTO_HTTP | CURLPROTO_HTTPS;
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER  => true,
        CURLOPT_FOLLOWLOCATION  => true,
        CURLOPT_MAXREDIRS       => 5,
        CURLOPT_PROTOCOLS       => $httpOnly,
        CURLOPT_REDIR_PROTOCOLS => $httpOnly,
        CURLOPT_TIMEOUT         => $timeout,
        CURLOPT_CONNECTTIMEOUT  => $timeout,
        CURLOPT_ENCODING        => '', // accept every encoding libcurl supports
        CURLOPT_USERAGENT       => 'Mozilla/5.0 (compatible; WooScraper/1.0)',
        CURLOPT_HTTPHEADER      => ['Accept: text/html,application/xhtml+xml;q=0.9,*/*;q=0.8'],
        CURLOPT_SSL_VERIFYPEER  => true,
        CURLOPT_SSL_VERIFYHOST  => 2,
    ]);
    $body   = curl_exec($ch);
    $err    = curl_error($ch);
    $status = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
    // curl_close() has no effect on PHP 8+, so it is only called on PHP 7.
    if (PHP_VERSION_ID < 80000) {
        curl_close($ch);
    }

    if ($body === false) {
        // Keep the failure shape identical to the "no cURL" branch above.
        $body = null;
        if ($err === '') $err = 'cURL request failed';
    }
    return ['status'=>$status, 'error'=>$err, 'html'=>$body];
}

/**
 * Resolve a link found on a page against that page's URL.
 *
 * - Absolute http(s) URLs are returned unchanged.
 * - Protocol-relative links ("//host/...") take the scheme of $base.
 * - Everything else is resolved against the scheme/host/port/path of $base,
 *   with "." and ".." segments and duplicate slashes removed from the path.
 *
 * Only the path is normalised: a "?query" or "#fragment" on $href is split
 * off first and re-attached verbatim, so values such as "?u=http://x/a/../b"
 * are not rewritten. A link consisting only of a query/fragment keeps the
 * base path as-is.
 *
 * @param string $href Link target as found in the document.
 * @param string $base URL of the document containing the link.
 * @return string Absolute URL.
 */
function resolve_url($href, $base) {
    if (preg_match('#^https?://#i', $href)) return $href;
    if (strncmp($href, '//', 2) === 0) {
        $scheme = parse_url($base, PHP_URL_SCHEME) ?: 'https';
        return $scheme . ':' . $href;
    }

    $p = parse_url($base);
    if (!is_array($p)) $p = []; // parse_url() returns false on malformed input
    $scheme   = isset($p['scheme']) ? $p['scheme'] : 'https';
    $host     = isset($p['host']) ? $p['host'] : '';
    $port     = isset($p['port']) ? ':' . $p['port'] : '';
    $basePath = isset($p['path']) ? $p['path'] : '/';

    // Detach "?query#fragment" so the segment clean-up below never touches it.
    $cut    = strcspn($href, '?#');
    $suffix = (string)substr($href, $cut);
    $href   = (string)substr($href, 0, $cut);

    if ($href === '') {
        // Query-only / fragment-only reference: same document path.
        $path = $basePath;
    } elseif ($href[0] === '/') {
        $path = $href;
    } else {
        // Relative reference: resolve against the directory of the base path.
        if (substr($basePath, -1) !== '/') {
            $basePath = preg_replace('#/[^/]*$#', '/', $basePath);
            if ($basePath === null || $basePath === '') $basePath = '/';
        }
        $path = $basePath . $href;
    }

    $parts = explode('/', $path);
    $last  = end($parts);
    // A path ending in "/", "/." or "/.." denotes a directory.
    $isDirectory = ($last === '' || $last === '.' || $last === '..');

    $segments = [];
    foreach ($parts as $seg) {
        if ($seg === '' || $seg === '.') continue;
        if ($seg === '..') { array_pop($segments); continue; }
        $segments[] = $seg;
    }
    $final = '/' . implode('/', $segments);
    // Re-add the directory slash, but never turn the root "/" into "//".
    if ($isDirectory && $segments) $final .= '/';

    return $scheme . '://' . $host . $port . $final . $suffix;
}

/**
 * Extract product names and links from a WooCommerce listing page.
 *
 * Products are the `li.product` items inside `ul.products`. For each item the
 * link is the `a.woocommerce-LoopProduct-link` element (or, failing that, the
 * first anchor), and the name is taken from the title element
 * (`.woocommerce-loop-product__title`, or the first h2/h3). The anchor text is
 * only a fallback, because the loop link may also wrap other content such as
 * a sale badge or a price, which would otherwise end up in the name.
 *
 * @param string $html    Page markup.
 * @param string $pageUrl URL the markup was fetched from; relative links are
 *                        resolved against it via resolve_url().
 * @return array List of ['name' => string, 'url' => string], unique by URL.
 *               Empty for empty or unparseable markup. When the DOM extension
 *               is missing, returns ['_error' => string] instead.
 */
function extract_products($html, $pageUrl) {
    if (!class_exists('DOMDocument')) {
        return ['_error' => 'DOM extension not available'];
    }
    // loadHTML() rejects an empty string (ValueError on PHP 8), and there is
    // nothing to extract from blank markup anyway.
    if (!is_string($html) || trim($html) === '') {
        return [];
    }

    $doc = new DOMDocument();
    // Collect HTML parse warnings internally instead of emitting them, then
    // restore whatever error mode was active before this call.
    // NOTE(review): character decoding relies on the page declaring its
    // charset; confirm behaviour for pages without a charset declaration.
    $previousErrorMode = libxml_use_internal_errors(true);
    $loaded = $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);
    if ($loaded === false) {
        return [];
    }
    $xp = new DOMXPath($doc);

    $productQuery = "//ul[contains(concat(' ', normalize-space(@class), ' '), ' products ')]//li[contains(concat(' ', normalize-space(@class), ' '), ' product ')]";
    $linkQuery    = ".//a[contains(concat(' ', normalize-space(@class), ' '), ' woocommerce-LoopProduct-link ')]";
    $titleQuery   = ".//*[contains(concat(' ', normalize-space(@class), ' '), ' woocommerce-loop-product__title ')]|.//h2|.//h3";

    $items = [];
    foreach ($xp->query($productQuery) as $li) {
        $a = $xp->query($linkQuery, $li)->item(0);
        if (!$a) $a = $xp->query(".//a[1]", $li)->item(0);
        if (!$a) continue;

        $href = trim($a->getAttribute('href'));

        // Prefer the dedicated title element; fall back to the link text.
        $name  = '';
        $title = $xp->query($titleQuery, $li)->item(0);
        if ($title) $name = trim($title->textContent);
        if ($name === '') $name = trim($a->textContent);

        if ($href === '' || $name === '') continue;

        $url = resolve_url($href, $pageUrl);
        // Collapse whitespace runs; keep the raw name if the regex fails.
        $collapsed = preg_replace('/\s+/u', ' ', $name);
        if (is_string($collapsed)) $name = $collapsed;

        $items[$url] = ['name'=>$name, 'url'=>$url]; // keyed by URL to de-duplicate
    }
    return array_values($items);
}

// ---------- main ----------
// Request flow: read input -> normalise + validate -> fetch each page ->
// merge products (unique by URL) -> respond with JSON.
try {
    $in = get_input();

    // Accept only a string URL and a scalar page count; anything else (for
    // example ?url[]=x, which arrives as an array) is treated as missing.
    $rawUrl = (isset($in['url']) && is_string($in['url'])) ? trim($in['url']) : '';
    $pages  = (isset($in['pages']) && is_scalar($in['pages'])) ? (int)$in['pages'] : 0;

    // Validate the URL in the same normalised form that will be fetched, so
    // validation and fetching cannot disagree about scheme or whitespace.
    // NOTE(review): the target host is caller-supplied and not restricted
    // here; consider an allow-list if this endpoint is publicly reachable.
    $baseUrl = ($rawUrl === '') ? '' : normalize_base_url($rawUrl);
    $parts   = ($baseUrl === '') ? false : parse_url($baseUrl);
    $urlOk   = is_array($parts)
        && isset($parts['scheme'], $parts['host'])
        && in_array(strtolower($parts['scheme']), ['http', 'https'], true)
        && filter_var($baseUrl, FILTER_VALIDATE_URL) !== false;
    if (!$urlOk) {
        respond(false, ['error' => 'پارامتر url نامعتبر است. نمونه: https://example.com/product-category/guitar/'], 400);
    }
    if ($pages < 1 || $pages > 200) {
        respond(false, ['error' => 'پارامتر pages باید بین 1 تا 200 باشد.'], 400);
    }

    if (!function_exists('curl_init')) {
        respond(false, ['error' => 'اکستنشن cURL روی سرور فعال نیست.'], 500);
    }
    if (!class_exists('DOMDocument')) {
        respond(false, ['error' => 'اکستنشن DOM/XML روی سرور فعال نیست.'], 500);
    }

    $seen = [];
    $warnings = [];
    for ($p = 1; $p <= $pages; $p++) {
        // Pause 200ms between consecutive requests, whether or not the
        // previous one succeeded.
        if ($p > 1) usleep(200000);

        $pageUrl = build_page_url($baseUrl, $p);
        $resp = fetch_url($pageUrl, 25);
        if (!empty($resp['error']) || $resp['status'] >= 400 || empty($resp['html'])) {
            $warnings[] = ['page'=>$p, 'url'=>$pageUrl, 'status'=>$resp['status'], 'error'=>$resp['error']];
            // A 404 beyond the first page means the listing has ended;
            // requesting the remaining pages would only repeat the failure.
            if ($p > 1 && (int)$resp['status'] === 404) break;
            continue;
        }
        $items = extract_products($resp['html'], $pageUrl);
        // extract_products() signals a failure with an '_error' key instead of a product list
        if (isset($items['_error'])) {
            $warnings[] = ['page'=>$p, 'url'=>$pageUrl, 'status'=>$resp['status'], 'error'=>$items['_error']];
            continue;
        }
        foreach ($items as $it) { $seen[$it['url']] = $it; }
    }

    $products = array_values($seen);
    $out = [
        'category_url' => $baseUrl,
        'pages'        => $pages,
        'count'        => count($products),
        'products'     => $products,
        'warnings'     => $warnings
    ];
    if ($debug) $out['_php'] = phpversion();

    respond(true, $out);

} catch (Throwable $e) {
    // Last-resort handler: report the failure as JSON; the stack trace is
    // only included when debug mode is on.
    respond(false, [
        'error' => 'خطای داخلی سرور',
        'details' => $e->getMessage(),
        'trace' => $debug ? $e->getTrace() : null
    ], 500);
}