<?php
/**
 * MyPhone.ir Category Scraper (PHP 7+)
 * GET params:
 *   - url   : category URL (may include /page/N/)
 *   - pages : integer >=1  (how many pages forward to crawl)
 *
 * Output JSON:
 * {
 *   ok: true,
 *   start_url: "...",
 *   start_page: 2,
 *   pages: 3,
 *   crawled_pages: [".../page/2/",".../page/3/",".../page/4/"],
 *   products: [ { "name":"...", "url":"https://myphone.ir/product/..." }, ... ],
 *   count: 123
 * }
 */

// The endpoint always answers with JSON; PHP errors are reported but never
// rendered into the response body.
header('Content-Type: application/json; charset=utf-8');
mb_internal_encoding('UTF-8');
ini_set('display_errors', '0');
error_reporting(E_ALL);

// ---------- Input handling (robust to encoded URLs) ----------
// PHP has already percent-decoded $_GET once. The additional decoding passes
// below are deliberate leniency for callers that double-encode the URL.
$inputUrl = '';
if (isset($_GET['url'])) {
    $inputUrl = urldecode(trim((string)$_GET['url']));
    if (strpos($inputUrl, '%') !== false) {
        $tmp = rawurldecode($inputUrl);
        if ($tmp) $inputUrl = $tmp;
    }
}
// Fallback: scan the raw query string when $_GET['url'] gave nothing usable,
// e.g. the parameter name was sent with different letter case (?URL=...).
if ($inputUrl === '' && !empty($_SERVER['QUERY_STRING'])) {
    // [^&]+ already stops at the next parameter. Anchoring the match to the end
    // of the query string would reject the documented "url=...&pages=N" form.
    if (preg_match('/(?:^|&)url=([^&]+)/i', $_SERVER['QUERY_STRING'], $m)) {
        $inputUrl = rawurldecode($m[1]);
    }
}
$pages = isset($_GET['pages']) ? (int)$_GET['pages'] : 1;
if ($pages < 1) $pages = 1;

// Accept scheme-less input ("myphone.ir/...") by assuming https.
$inputUrl = trim($inputUrl);
if ($inputUrl !== '' && !preg_match('#^https?://#i', $inputUrl)) {
    $inputUrl = 'https://' . ltrim($inputUrl, '/');
}
// parse_url() returns false for seriously malformed URLs; isset() covers that.
$pu = parse_url($inputUrl);
$host = isset($pu['host']) ? strtolower($pu['host']) : '';
if ($inputUrl === '' || $host === '') {
    http_response_code(400);
    echo json_encode(['ok'=>false,'error'=>'invalid_or_missing_url','hint'=>'Provide ?url=<encoded myphone.ir category URL>&pages=N']);
    exit;
}
// Only myphone.ir and its subdomains may be crawled.
if (!preg_match('/(^|\.)myphone\.ir$/i', $host)) {
    http_response_code(400);
    echo json_encode(['ok'=>false,'error'=>'only_myphone_ir_is_allowed']);
    exit;
}

// ---------- Derive base category URL + start page ----------
list($baseUrl, $startPage) = normalize_category_url_and_start($inputUrl);

// $inputUrl is percent-decoded user input and is echoed back as start_url, so it
// may contain invalid UTF-8. Without a substitution flag json_encode() returns
// false in that case and the client receives an empty body.
$jsonFlags = JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_PRETTY_PRINT
    | (defined('JSON_INVALID_UTF8_SUBSTITUTE') ? JSON_INVALID_UTF8_SUBSTITUTE : JSON_PARTIAL_OUTPUT_ON_ERROR);

// ---------- Crawl ----------
try {
    $seen = [];            // product url => true (cross-page de-duplication)
    $products = [];        // collected items, in discovery order
    $pageUrls = [];        // pages that were fetched successfully (for debug)

    for ($i = 0; $i < $pages; $i++) {
        $p = $startPage + $i;
        // Page 1 is the bare category URL; later pages use the /page/N/ suffix.
        $pageUrl = ($p === 1) ? remove_trailing_page_segment($baseUrl) : ensure_trailing_slash($baseUrl) . 'page/' . $p . '/';

        $html = fetch_url($pageUrl);
        if ($html === null) {
            // If one of the pages is unavailable, keep going with the next one.
            continue;
        }

        $items = parse_category_page($html, $pageUrl);
        foreach ($items as $it) {
            $u = $it['url'];
            if (!isset($seen[$u])) {
                $seen[$u] = true;
                $products[] = $it;
            }
        }
        $pageUrls[] = $pageUrl;
    }

    $out = [
        'ok'           => true,
        'start_url'    => $inputUrl,
        'start_page'   => $startPage,
        'pages'        => $pages,
        'crawled_pages'=> $pageUrls,
        'products'     => $products,
        'count'        => count($products),
    ];
    echo json_encode($out, $jsonFlags);

} catch (Throwable $e) {
    // Any unexpected failure is reported as a JSON 500 instead of a blank page.
    http_response_code(500);
    echo json_encode([
        'ok' => false,
        'error' => 'exception',
        'message' => $e->getMessage(),
        'line' => $e->getLine(),
        'file' => basename($e->getFile())
    ], $jsonFlags);
}

/* ============================ Helpers ============================ */

/**
 * Download a URL and return its body.
 *
 * Uses cURL when the extension is loaded, otherwise falls back to
 * file_get_contents() with an HTTP stream context.
 *
 * @param string $url     Address to fetch.
 * @param int    $timeout Overall timeout in seconds.
 * @return string|null Response body, or null on transport failure or (cURL path) HTTP status >= 400.
 */
function fetch_url($url, $timeout = 25) {
    if (function_exists('curl_init')) {
        $handle = curl_init($url);
        $options = [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS      => 5,
            CURLOPT_TIMEOUT        => $timeout,
            CURLOPT_CONNECTTIMEOUT => 15,
            CURLOPT_SSL_VERIFYPEER => true,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36',
            CURLOPT_HTTPHEADER     => [
                'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language: fa-IR,fa;q=0.9,en-US;q=0.8,en;q=0.7'
            ],
            CURLOPT_REFERER        => 'https://myphone.ir/',
            CURLOPT_ENCODING       => '' // empty string: accept every encoding cURL supports (gzip/deflate)
        ];
        curl_setopt_array($handle, $options);

        $response = curl_exec($handle);
        $status   = curl_getinfo($handle, CURLINFO_HTTP_CODE);
        curl_close($handle);

        return ($response !== false && $status < 400) ? $response : null;
    }

    // No cURL available: plain stream wrapper request.
    $context = stream_context_create([
        'http' => [
            'method'  => 'GET',
            'timeout' => $timeout,
            'header'  => "User-Agent: Mozilla/5.0 (compatible; MyPhoneListScraper/1.0)\r\nAccept-Language: fa-IR,fa;q=0.9,en-US;q=0.8\r\n"
        ]
    ]);
    $response = @file_get_contents($url, false, $context);
    return $response === false ? null : $response;
}

/**
 * Parse product cards from a category page (WooCommerce style).
 *
 * Primary selector: a.woocommerce-LoopProduct-link inside ul.products li.product.
 * If that yields nothing, any link inside a product card whose href contains
 * "/product/" is used instead.
 *
 * The product name is taken from the card's
 * h2.woocommerce-loop-product__title when one exists; the link's full text is
 * only a fallback because it may also contain other card text (price, badges),
 * depending on the theme markup. Whitespace runs in names are collapsed.
 *
 * URLs are made absolute against $pageUrl, normalized, filtered to myphone.ir
 * product paths and de-duplicated; a missing name is derived from the URL slug.
 *
 * @param string $html    Raw HTML of the category page.
 * @param string $pageUrl URL the HTML was fetched from (base for relative links).
 * @return array List of ['name' => string, 'url' => string].
 */
function parse_category_page($html, $pageUrl) {
    // Collect libxml warnings internally while parsing real-world HTML, then
    // restore whatever error mode the caller had.
    $previousErrorMode = libxml_use_internal_errors(true);
    $dom = new DOMDocument('1.0', 'UTF-8');
    if (stripos($html, '<meta charset=') === false) {
        // Encoding hint so the parser reads the document as UTF-8.
        $html = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />' . $html;
    }
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);
    $xp = new DOMXPath($dom);

    // Collapse whitespace runs and trim; falls back to the raw text if PCRE fails.
    $cleanText = function ($text) {
        $text = (string)$text;
        $collapsed = preg_replace('/\s+/u', ' ', $text);
        return trim($collapsed === null ? $text : $collapsed);
    };
    $titleQuery = './/h2[contains(@class,"woocommerce-loop-product__title")]';

    $items = [];

    // Common WooCommerce pattern: the link on the product card.
    $nodes = $xp->query('//ul[contains(@class,"products")]//li[contains(@class,"product")]//a[contains(@class,"woocommerce-LoopProduct-link")]');
    foreach ($nodes as $a) {
        /** @var DOMElement $a */
        $url = absolutize_url($a->getAttribute('href'), $pageUrl);
        if (!$url) {
            continue;
        }

        // 1) title heading inside the link, 2) the link's own text,
        // 3) title heading elsewhere in the enclosing card.
        $name = '';
        $h2 = $xp->query($titleQuery, $a)->item(0);
        if ($h2) $name = $cleanText($h2->textContent);
        if ($name === '') {
            $name = $cleanText($a->textContent);
        }
        if ($name === '' && $a->parentNode) {
            $h2 = $xp->query($titleQuery, $a->parentNode)->item(0);
            if ($h2) $name = $cleanText($h2->textContent);
        }

        $items[] = ['name' => $name !== '' ? $name : null, 'url' => $url];
    }

    // Generic fallback when the selector above produced nothing.
    if (empty($items)) {
        $alt = $xp->query('//ul[contains(@class,"products")]//li[contains(@class,"product")]//a[contains(@href,"/product/")]');
        foreach ($alt as $a) {
            $url  = absolutize_url($a->getAttribute('href'), $pageUrl);
            $name = $cleanText($a->textContent);
            if ($url) $items[] = ['name' => $name !== '' ? $name : null, 'url' => $url];
        }
    }

    // De-duplicate by normalized URL, keeping first-seen order. A later
    // occurrence may still supply the name when the first one had none
    // (e.g. an image-only link followed by the title link).
    $nameByUrl = [];
    foreach ($items as $it) {
        $u = normalize_product_url($it['url']);
        if (!$u) {
            continue;
        }
        if (!array_key_exists($u, $nameByUrl) || ($nameByUrl[$u] === null && $it['name'] !== null)) {
            $nameByUrl[$u] = $it['name'];
        }
    }

    $out = [];
    foreach ($nameByUrl as $u => $name) {
        // Empty name? Build a minimal one from the URL slug.
        if (!$name) $name = guess_name_from_url($u);
        $out[] = ['name' => $name, 'url' => $u];
    }
    return $out;
}

/**
 * Split a category URL into its base URL and the page number it points at.
 *
 * A trailing "/page/N/" segment is removed from the base and N (at least 1)
 * becomes the start page; without such a segment the start page is 1.
 *
 * @param string $url Category URL, optionally ending in /page/N/.
 * @return array [string $baseUrl, int $startPage]
 */
function normalize_category_url_and_start($url) {
    // Work on a slash-terminated URL so both patterns can anchor on the final "/".
    $normalized = ensure_trailing_slash($url);

    $hasPageSegment = preg_match('#^(?P<base>.*/)(page/)(?P<n>\d+)/$#i', $normalized, $match);
    if (!$hasPageSegment) {
        // No /page/ segment: crawling starts at page 1.
        return [$normalized, 1];
    }

    $startPage = (int)$match['n'];
    if ($startPage < 1) {
        $startPage = 1;
    }

    // Cut the pagination suffix off the end to obtain the category base.
    $category = preg_replace('#page/\d+/$#i', '', $normalized);
    return [$category, $startPage];
}

/**
 * Remove a trailing "/page/N/" pagination segment if present.
 *
 * The segment must be a whole path component: a slug that merely ends in
 * "page" followed by a numeric segment (e.g. ".../homepage/2/") is left intact.
 *
 * @param string $url URL to clean; a trailing slash is added first if missing.
 * @return string Slash-terminated URL without the pagination segment.
 */
function remove_trailing_page_segment($url) {
    $u = ensure_trailing_slash($url);
    // The lookbehind requires "page" to start right after a path separator.
    return preg_replace('#(?<=/)page/\d+/$#i', '', $u);
}

/**
 * Strip trailing whitespace from a URL and make sure it ends with "/".
 *
 * The slash check is done on the trimmed value, so input such as "…/cat/ "
 * yields "…/cat/" rather than "…/cat//".
 *
 * @param string $url
 * @return string The trimmed URL ending in a slash ("/" for an empty input).
 */
function ensure_trailing_slash($url) {
    $trimmed = rtrim((string)$url, " \t\n\r\0\x0B");
    return substr($trimmed, -1) === '/' ? $trimmed : $trimmed . '/';
}

/**
 * Turn a possibly relative link into an absolute URL using $base.
 *
 * Handles protocol-relative ("//host/…"), already-absolute, root-relative
 * ("/path") and document-relative ("path") references. Document-relative
 * references are resolved against the directory of the base path, i.e.
 * everything up to and including its last "/". Dot segments ("./", "../")
 * are not collapsed.
 *
 * @param string $maybe Link as found in the document.
 * @param string $base  Absolute URL of the document containing the link.
 * @return string Absolute URL, or '' when $maybe is empty or $base has no scheme/host.
 */
function absolutize_url($maybe, $base) {
    // Attribute values may carry surrounding whitespace.
    $maybe = trim((string)$maybe);
    if ($maybe === '') return '';

    // Protocol-relative: inherit the scheme of the base document.
    if (strpos($maybe, '//') === 0) {
        $scheme = parse_url($base, PHP_URL_SCHEME) ?: 'https';
        return $scheme . ':' . $maybe;
    }

    // Already absolute (has its own scheme).
    if (parse_url($maybe, PHP_URL_SCHEME)) return $maybe;

    $p = parse_url($base);
    if (!is_array($p) || empty($p['scheme']) || empty($p['host'])) {
        // Nothing to resolve against.
        return '';
    }
    $root = $p['scheme'] . '://' . $p['host'] . (isset($p['port']) ? ':' . $p['port'] : '');

    // Root-relative.
    if ($maybe[0] === '/') return $root . $maybe;

    // Document-relative: keep the base path up to its last slash. dirname()
    // is not suitable here because it drops the final directory of a path
    // that ends in "/".
    $path = (isset($p['path']) && $p['path'] !== '') ? $p['path'] : '/';
    $cut  = strrpos($path, '/');
    $dir  = ($cut === false) ? '/' : substr($path, 0, $cut + 1);
    return $root . $dir . $maybe;
}

/**
 * Canonicalize a product URL for de-duplication.
 *
 * Drops the query string and fragment (tracking parameters, anchors),
 * lower-cases the host and guarantees a trailing slash. Only absolute URLs on
 * myphone.ir (or a subdomain) whose path contains "/product/" are accepted.
 *
 * @param string $u Absolute URL.
 * @return string|null Canonical URL, or null when the URL is not an accepted product URL.
 */
function normalize_product_url($u) {
    $parsed = parse_url($u);
    if (!$parsed) {
        return null;
    }
    if (empty($parsed['scheme']) || empty($parsed['host'])) {
        return null;
    }

    $hostName = strtolower($parsed['host']);
    if (!preg_match('/(^|\.)myphone\.ir$/i', $hostName)) {
        return null;
    }

    // Keep product paths only.
    $urlPath = isset($parsed['path']) ? $parsed['path'] : '/';
    if (strpos($urlPath, '/product/') === false) {
        return null;
    }

    // Rebuild from scheme, host and path only; force a trailing slash for stability.
    $canonical = $parsed['scheme'] . '://' . $hostName . $urlPath;
    return substr($canonical, -1) === '/' ? $canonical : $canonical . '/';
}

/**
 * Derive a minimal display name from a product URL.
 *
 * Takes the last path segment (the slug), turns "-" and "_" into spaces and
 * URL-decodes the result.
 *
 * @param string $u Product URL.
 * @return string Human-readable approximation of the product name.
 */
function guess_name_from_url($u) {
    $pathOnly = parse_url($u, PHP_URL_PATH);

    // Last slug of the product address.
    $withoutSlash = rtrim($pathOnly, '/');
    $lastSegment  = trim(basename($withoutSlash), '/');

    // Byte-for-byte translation: "-" and "_" each become a space.
    $spaced = strtr($lastSegment, '-_', '  ');
    return urldecode($spaced);
}