<?php
/**
 * Sornashop search scraper (name + link) with /page/{n}/ pagination
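 * Usage example (percent-encode the `url` value; otherwise PHP parses its own
 * `&post_type=...&orderby=...` pairs as parameters of this script rather than of `url`):
 * sorna_api.php?url=https%3A%2F%2Fwww.sornashop.com%2Fshop%2F%3Fs%3D%DB%8C%D8%A7%D9%85%D8%A7%D9%87%D8%A7%26post_type%3Dproduct%26orderby%3Dprice-desc&start=1&pages=3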
 *
 * Notes:
 * - Automatically percent-encodes Persian query params (RFC3986).
 * - Paginates by inserting /page/{n}/ before the query string.
 * - Extracts product title + link using multiple selectors (Woodmart/WooCommerce).
 */

// Make UTF-8 the default encoding for the mb_* functions used in this script.
mb_internal_encoding('UTF-8');

// Both the success and the error paths of this endpoint emit JSON.
header('Content-Type: application/json; charset=UTF-8');

// ========== Utilities ==========
/**
 * Reassemble a URL string from a component array shaped like parse_url() output.
 *
 * Absent components are left out, except the path, which defaults to "/".
 * Credentials (user / pass) are written back as "user:pass@" so that a
 * parse_url() -> rebuild_url() round trip does not silently drop them.
 *
 * @param array $parts Components keyed as in parse_url():
 *                     scheme, user, pass, host, port, path, query, fragment.
 * @return string The rebuilt URL.
 */
function rebuild_url(array $parts): string {
    $scheme = isset($parts['scheme']) ? $parts['scheme'] . '://' : '';

    // Userinfo: "user@" or "user:pass@"; a password without a user is meaningless.
    $auth = '';
    if (isset($parts['user'])) {
        $auth = $parts['user'];
        if (isset($parts['pass'])) {
            $auth .= ':' . $parts['pass'];
        }
        $auth .= '@';
    }

    $host  = $parts['host'] ?? '';
    $port  = isset($parts['port']) ? ':' . $parts['port'] : '';
    $path  = $parts['path'] ?? '/';
    // An empty query is treated like a missing one so no dangling "?" is produced.
    $query = isset($parts['query']) && $parts['query'] !== '' ? '?' . $parts['query'] : '';
    $frag  = isset($parts['fragment']) ? '#' . $parts['fragment'] : '';

    return $scheme . $auth . $host . $port . $path . $query . $frag;
}

/**
 * Re-encode the query string of a URL with RFC 3986 percent-encoding.
 *
 * The query is decoded into an array with parse_str() and serialised again
 * with http_build_query(..., PHP_QUERY_RFC3986), so non-ASCII values such as
 * Persian search terms come out as %XX sequences and spaces as %20.
 * A URL without a query component is returned untouched.
 *
 * @param string $url URL whose query parameters should be encoded.
 * @return string The URL with an encoded query string.
 */
function encode_query_params(string $url): string {
    $components = parse_url($url);
    $rawQuery   = $components['query'] ?? null;

    // Nothing to encode: hand the input back exactly as received.
    if ($rawQuery === null) {
        return $url;
    }

    $params = [];
    parse_str($rawQuery, $params);
    $components['query'] = http_build_query($params, '', '&', PHP_QUERY_RFC3986);

    return rebuild_url($components);
}

/**
 * Build the URL of results page $page by inserting /page/{n}/ before the query string.
 *
 * For $page <= 1 the base URL is returned with only its query re-encoded.
 * For later pages the URL is rebuilt from scheme, host, port, path and query;
 * the fragment is not carried over. A pagination suffix already present on the
 * base path (".../page/3/") is replaced rather than duplicated, and an explicit
 * port is preserved so every page targets the same origin as page 1.
 *
 * @param string $baseUrl Listing/search URL, optionally with a query string.
 * @param int    $page    1-based page number.
 * @return string URL of the requested page with an RFC 3986 encoded query.
 */
function build_paged_url(string $baseUrl, int $page): string {
    if ($page <= 1) return encode_query_params($baseUrl);

    $parts  = parse_url($baseUrl);
    $scheme = $parts['scheme'] ?? 'https';
    $host   = $parts['host'] ?? '';
    $port   = isset($parts['port']) ? ':' . $parts['port'] : '';
    $path   = rtrim($parts['path'] ?? '/', '/'); // e.g. /shop

    // Avoid /shop/page/2/page/3/ when the caller passed an already-paginated URL.
    $path = (string) preg_replace('~/page/\d+$~', '', $path);

    $query = '';
    if (isset($parts['query'])) {
        parse_str($parts['query'], $qarr);
        // RFC3986 encoding -> spaces become %20, non-ASCII becomes %XX
        $query = http_build_query($qarr, '', '&', PHP_QUERY_RFC3986);
        $query = $query !== '' ? '?' . $query : '';
    }

    return $scheme . '://' . $host . $port . $path . '/page/' . $page . '/' . $query;
}

/**
 * Download a page with cURL and return its body as UTF-8 text.
 *
 * The URL's query parameters are percent-encoded before the request. Redirects
 * are followed (at most 5), TLS peers are verified, and browser-like headers
 * are sent. Only http and https are permitted, for the initial request and for
 * redirects, so a caller-supplied URL cannot be used to read file:// or reach
 * other protocol handlers.
 *
 * @param string $url   Page to fetch.
 * @param bool   $debug When true, the cURL error text is appended to the exception message.
 * @return string Response body; converted to UTF-8 if it does not validate as UTF-8.
 * @throws Exception If cURL cannot be initialised, the transfer fails, or the HTTP status is >= 400.
 */
function fetch_html(string $url, bool $debug = false): string {
    // Ensure query params are encoded before request
    $url = encode_query_params($url);

    $ch = curl_init($url);
    if ($ch === false) {
        throw new Exception("Unable to initialise cURL for: $url");
    }

    $allowedProtocols = CURLPROTO_HTTP | CURLPROTO_HTTPS;
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER  => true,
        CURLOPT_FOLLOWLOCATION  => true,
        CURLOPT_MAXREDIRS       => 5,
        CURLOPT_CONNECTTIMEOUT  => 15,
        CURLOPT_TIMEOUT         => 25,
        CURLOPT_SSL_VERIFYPEER  => true,
        CURLOPT_SSL_VERIFYHOST  => 2,
        // The URL originates from request input: never allow non-web schemes.
        CURLOPT_PROTOCOLS       => $allowedProtocols,
        CURLOPT_REDIR_PROTOCOLS => $allowedProtocols,
        CURLOPT_HTTPHEADER      => [
            'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome Safari/537.36',
            'Accept-Language: fa-IR,fa;q=0.9,en;q=0.8',
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Referer: https://www.sornashop.com/'
        ],
    ]);
    $html  = curl_exec($ch);
    $err   = curl_error($ch);
    $code  = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    // A transport failure reports status 0; HTTP errors report their real status.
    if ($html === false || $code >= 400) {
        $msg = "HTTP $code while fetching: $url";
        if ($debug && $err) { $msg .= " | $err"; }
        throw new Exception($msg);
    }

    // Strict check: only convert when the body is not already valid UTF-8.
    if (!mb_detect_encoding($html, 'UTF-8', true)) {
        $html = mb_convert_encoding($html, 'UTF-8', 'auto');
    }
    return $html;
}

/**
 * Extract product names and links from a WooCommerce/Woodmart listing page.
 *
 * Product cards are looked up inside every div/ul whose class list contains
 * "products"; when no such container exists the whole document is scanned.
 * Within a card the link selectors are tried in priority order (title links
 * first, any "/product/" href last), and the first match that carries an href
 * wins. If the chosen link has no text, the card's first h2/h3 supplies the name.
 * Relative and protocol-relative hrefs are made absolute against
 * https://www.sornashop.com. Results are de-duplicated by URL.
 *
 * @param string $html Page markup, expected to be UTF-8.
 * @return array List of ['name' => string, 'url' => string] entries in discovery order.
 */
function extract_products(string $html): array {
    // Collect libxml parse errors internally (real-world HTML is rarely valid),
    // then restore the previous mode so other code in the process is unaffected.
    $previousErrorMode = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    // The XML-declaration prefix makes DOMDocument treat the content as UTF-8.
    $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    $xp = new DOMXPath($dom);

    // Ordered from most to least specific. They are queried one at a time because
    // a single XPath union would return nodes in document order, not in this order.
    $linkSelectors = [
        ".//h3[contains(@class,'product-title')]/a",
        ".//a[contains(@class,'product-title')]",
        ".//a[contains(@class,'wd-entities-title')]",
        ".//a[contains(@class,'product-link')]",
        ".//a[contains(@href,'/product/')]", // last resort
    ];

    $out = [];

    // Containers: either <div class="products ..."> or <ul class="products ...">
    $containers = $xp->query(
        "//div[contains(concat(' ',normalize-space(@class),' '),' products ')]" .
        " | //ul[contains(concat(' ',normalize-space(@class),' '),' products ')]"
    );
    if ($containers->length === 0) {
        // Fallback: scan whole document if not found
        $containers = [$dom->documentElement];
    }

    foreach ($containers as $c) {
        // Cards: Woodmart sometimes uses div.product-grid-item, Woo uses li.product
        $cards = $xp->query(
            ".//div[contains(@class,'product-grid-item') and contains(@class,'product')]" .
            " | .//li[contains(@class,'product')]",
            $c
        );

        foreach ($cards as $card) {
            $a = null;
            foreach ($linkSelectors as $selector) {
                $candidate = $xp->query($selector, $card)->item(0);
                if ($candidate instanceof DOMElement && $candidate->hasAttribute('href')) {
                    $a = $candidate;
                    break;
                }
            }
            if ($a === null) { continue; }

            $name = trim(preg_replace('/\s+/u', ' ', $a->textContent));
            if ($name === '') {
                // Image-only links carry no text; use the card heading instead.
                $h = $xp->query(".//h2|.//h3", $card)->item(0);
                if ($h) { $name = trim(preg_replace('/\s+/u', ' ', $h->textContent)); }
            }
            $href = trim($a->getAttribute('href'));
            if ($name === '' || $href === '') { continue; }

            // Absolutize: "//host/path" only lacks a scheme; anything else that is
            // not already http(s) is treated as a path on the shop's origin.
            if (strncmp($href, '//', 2) === 0) {
                $href = 'https:' . $href;
            } elseif (!preg_match('~^https?://~i', $href)) {
                $href = 'https://www.sornashop.com/' . ltrim($href, '/');
            }
            $out[$href] = ['name' => $name, 'url' => $href]; // de-dup by URL
        }
    }
    return array_values($out);
}

// ========== Main ==========
// Request flow: read and validate parameters, fetch each requested page,
// merge the extracted products (de-duplicated by URL) and emit one JSON document.
// Parameters: url (required), start (first page, default 1), pages (page count,
// default 1), debug=1 (include cURL error text in failures).
try {
    // $_GET values can be arrays (url[]=...); only a plain string is acceptable.
    $url    = (isset($_GET['url']) && is_string($_GET['url'])) ? trim($_GET['url']) : '';
    $start  = isset($_GET['start']) ? max(1, (int)$_GET['start']) : 1;
    $pages  = isset($_GET['pages']) ? max(1, (int)$_GET['pages']) : 1;
    $debug  = isset($_GET['debug']) && $_GET['debug'] === '1';

    if ($url === '') {
        http_response_code(400);
        echo json_encode(['ok' => false, 'error' => "Missing 'url' param"], JSON_UNESCAPED_UNICODE);
        exit;
    }

    // When the caller does not percent-encode `url`, PHP splits the target's own
    // "&key=value" pairs off into $_GET. Re-attach every parameter this script
    // does not consume so the target URL matches what the caller wrote.
    $reserved = ['url' => true, 'start' => true, 'pages' => true, 'debug' => true];
    $forwarded = array_diff_key($_GET, $reserved);
    if ($forwarded) {
        $glue = (strpos($url, '?') === false) ? '?' : '&';
        $url .= $glue . http_build_query($forwarded, '', '&', PHP_QUERY_RFC3986);
    }

    // The URL is untrusted input that this server will request: require an
    // absolute http(s) URL.
    // NOTE(review): consider also restricting the host to the shop's domain.
    $target = parse_url($url);
    $scheme = is_array($target) ? strtolower($target['scheme'] ?? '') : '';
    $host   = is_array($target) ? ($target['host'] ?? '') : '';
    if (!in_array($scheme, ['http', 'https'], true) || $host === '') {
        http_response_code(400);
        echo json_encode(
            ['ok' => false, 'error' => "Invalid 'url' param: expected an absolute http(s) URL"],
            JSON_UNESCAPED_UNICODE
        );
        exit;
    }

    $all = [];
    $lastPage = $start + $pages - 1;
    for ($p = $start; $p <= $lastPage; $p++) {
        $pageUrl = ($p === 1) ? $url : build_paged_url($url, $p);
        $html    = fetch_html($pageUrl, $debug);
        $items   = extract_products($html);
        foreach ($items as $it) { $all[$it['url']] = $it; } // de-dup
        // Be polite between requests (150ms); no need to wait after the last one.
        if ($p < $lastPage) { usleep(150000); }
    }

    echo json_encode([
        'ok'       => true,
        'input'    => ['url' => $url, 'start' => $start, 'pages' => $pages],
        'count'    => count($all),
        'products' => array_values($all)
    ], JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_PRETTY_PRINT);

} catch (Throwable $e) {
    // Any failure (fetch error, parse error, bad argument type) becomes a JSON 500.
    http_response_code(500);
    echo json_encode([
        'ok'    => false,
        'error' => $e->getMessage()
    ], JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);
}