<?php
/**
 * WooCommerce Product Scraper -> JSON (PHP 7+)
 * Author: You
 * Usage: /scrape.php?url=<PRODUCT_URL>
 * Output: JSON { ok, fetched_at, url_input, url, name, sku, brand, prices{...}, features{...}, images[], description{html,text} }
 *
 * Notes:
 * - لینک‌های فارسی/یونیکد پشتیبانی می‌شود (نرمال‌سازی: Punycode + rawurlencode).
 * - برای دامنه‌های IDN بهتر است ext-intl فعال باشد (idn_to_ascii).
 * - لطفاً قوانین سایت مقصد و robots.txt را رعایت کنید.
 */

// Report every error to the log, but never print PHP diagnostics into the JSON body.
error_reporting(E_ALL);
ini_set('display_errors', '0');

// JSON payload plus permissive CORS headers (restrict the allowed origin if needed).
array_map('header', [
    'Content-Type: application/json; charset=UTF-8',
    'Access-Control-Allow-Origin: *',
    'Access-Control-Allow-Methods: GET, OPTIONS',
    'Access-Control-Allow-Headers: Content-Type',
]);

// CORS preflight: answer with an empty 204 and stop before any scraping happens.
if ('OPTIONS' === $_SERVER['REQUEST_METHOD']) {
    http_response_code(204);
    exit;
}

// All mb_* calls below operate on UTF-8 text.
mb_internal_encoding('UTF-8');

// Main request handler: validate ?url=, download the page, extract the product
// fields with XPath and print them as JSON. Any failure is reported as a
// 400 response carrying { ok: false, error }.
try {
    // ------- Input -------
    // Accept only a plain string; "?url[]=x" would otherwise hand an array to trim().
    $rawUrl = (isset($_GET['url']) && is_string($_GET['url'])) ? trim($_GET['url']) : '';
    if ($rawUrl === '') {
        throw new Exception('پارامتر url نامعتبر است.');
    }

    // Normalise the input link (Unicode -> ASCII-safe).
    $url = normalize_input_url($rawUrl);
    if (!filter_var($url, FILTER_VALIDATE_URL)) {
        throw new Exception('پارامتر url نامعتبر است.');
    }

    // The URL is caller-controlled: FILTER_VALIDATE_URL also accepts schemes such
    // as file:// or gopher://, so only web schemes are allowed to reach the fetcher.
    $urlScheme = strtolower((string)parse_url($url, PHP_URL_SCHEME));
    if ($urlScheme !== 'http' && $urlScheme !== 'https') {
        throw new Exception('پارامتر url نامعتبر است.');
    }

    // ------- Fetch HTML -------
    $html = fetch_html($url);
    if (!$html) {
        throw new Exception('قادر به دریافت محتوای صفحه نیستم.');
    }

    // Normalise the encoding to UTF-8.
    // Only names this mbstring build knows are offered to the detector: an unknown
    // name (Windows-1256 is not in the stock mbstring list) makes
    // mb_detect_encoding() throw a ValueError on PHP 8 and fail every request.
    $encCandidates = array_values(array_intersect(
        ['UTF-8','Windows-1256','ISO-8859-6','ISO-8859-1','ASCII'],
        mb_list_encodings()
    ));
    $enc = $encCandidates ? mb_detect_encoding($html, $encCandidates, true) : false;
    if ($enc && strtoupper($enc) !== 'UTF-8') {
        $html = mb_convert_encoding($html, 'UTF-8', $enc);
    }

    // ------- DOM/XPath -------
    $dom = new DOMDocument();
    libxml_use_internal_errors(true); // real-world markup is rarely valid; keep libxml quiet
    $dom->loadHTML($html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
    libxml_clear_errors();
    $xp = new DOMXPath($dom);

    // ===== Product name =====
    $name = firstText($xp, [
        "//h1[contains(@class,'product_title') or contains(@class,'entry-title')]",
        "//meta[@property='og:title']/@content",
        "//title",
    ]);

    // ===== SKU =====
    $sku = firstText($xp, [
        "//span[contains(@class,'sku')]",
        "//div[contains(@class,'product_meta')]//span[contains(@class,'sku')]",
        "//tr[th[contains(normalize-space(.), 'SKU') or contains(normalize-space(.), 'اسک')]]/td",
    ]);

    // ===== Brand =====
    $brand = firstText($xp, [
        "//tr[th[contains(., 'برند') or contains(., 'Brand')]]/td",
        "//div[contains(@class,'product-brands')]//a",
        "//span[contains(@class,'posted_in')]/a[contains(@href,'brand') or contains(@href,'برند')]",
        "(//nav[contains(@class,'breadcrumb') or contains(@class,'woocommerce-breadcrumb')]//a)[last()-1]",
    ]);
    $brand = cleanBrand($brand);

    // ===== Prices =====
    // WooCommerce: the sale price lives in <ins>, the previous price in <del>.
    $saleAmount = firstText($xp, [
        "//p[contains(@class,'price')]//ins//*[contains(@class,'amount') or self::text()]",
        "//div[contains(@class,'summary')]//ins//*[contains(@class,'amount') or self::text()]",
        "//p[contains(@class,'price')]//*[contains(@class,'amount')][1]",
    ]);
    $regularAmount = firstText($xp, [
        "//p[contains(@class,'price')]//del//*[contains(@class,'amount') or self::text()]",
        "//div[contains(@class,'summary')]//del//*[contains(@class,'amount') or self::text()]",
        "(//p[contains(@class,'price')]//*[contains(@class,'amount')])[2]",
    ]);
    $currency = firstText($xp, [
        "//meta[@property='product:price:currency']/@content",
        "//meta[@property='og:price:currency']/@content",
    ]);
    if (!$currency) {
        // No currency meta tag: infer it from the currency word printed next to the amount.
        $joined = ($saleAmount ?? '') . ' ' . ($regularAmount ?? '');
        if (preg_match('/تومان/u', $joined)) $currency = 'TOMAN';
        elseif (preg_match('/ریال/u', $joined)) $currency = 'IRR';
        else $currency = null;
    }
    // PHP only expands the braced form \u{200C}; the bare \u200C spelling stays a
    // literal backslash sequence and can never match page text.
    $priceUpdatedAt = firstText($xp, [
        "//text()[contains(., 'به\u{200C}روزرسانی قیمت') or contains(., 'به‌روزرسانی قیمت') or contains(., 'آخرین بروزرسانی قیمت')]/parent::*",
        "//small[contains(., 'به\u{200C}روزرسانی') or contains(., 'به‌روزرسانی')]",
    ]);
    $prices = [
        'sale_raw'    => cleanMoney($saleAmount),
        'regular_raw' => cleanMoney($regularAmount),
        'sale'        => normalizePrice($saleAmount),
        'regular'     => normalizePrice($regularAmount),
        'currency'    => $currency,
        'updated_at'  => normalizeDateText($priceUpdatedAt),
    ];

    // ===== Features (Specs) =====
    $features = [];

    // th/td tables: header cell is the key, first data cell is the value.
    foreach ($xp->query("//table[.//th]//tr") as $tr) {
        $th = $xp->query(".//th", $tr)->item(0);
        $td = $xp->query(".//td", $tr)->item(0);
        if ($th && $td) {
            $k = trimAll($th->textContent);
            $v = trimAll(nodeTextWithBreaks($td));
            if ($k !== '' && $v !== '') {
                $features[$k] = $v;
            }
        }
    }

    // "key: value" list items (ASCII or full-width colon); later entries overwrite earlier ones.
    foreach ($xp->query("//ul/li") as $li) {
        $txt = trimAll($li->textContent);
        if ($txt === '') continue;
        if (strpos($txt, ':') !== false || mb_strpos($txt, '：') !== false) {
            list($k, $v) = mb_explode_first([':', '：'], $txt);
            $k = trimAll($k); $v = trimAll($v);
            if ($k !== '' && $v !== '') $features[$k] = $v;
        }
    }

    // ===== Images =====
    $abs = function($link) use ($url) { return make_absolute_url($url, $link); };
    $imgUrls = [];

    // OG images
    foreach ($xp->query("//meta[@property='og:image']/@content") as $n) {
        $imgUrls[] = $n->nodeValue;
    }

    // WooCommerce gallery (full-size / lazy-load attributes, then src and srcset)
    foreach ($xp->query("//div[contains(@class,'woocommerce-product-gallery')]//img") as $img) {
        foreach (['data-large_image','data-src','data-original','src'] as $attr) {
            if ($img->hasAttribute($attr)) {
                $imgUrls[] = $img->getAttribute($attr);
            }
        }
        if ($img->hasAttribute('srcset')) {
            $parts = preg_split('/\s*,\s*/', $img->getAttribute('srcset'));
            foreach ($parts as $p) {
                // Drop the width/density descriptor ("640w", "2x") that follows each candidate.
                $u = trim(preg_replace('/\s+\d+[wx]$/', '', $p));
                if ($u) $imgUrls[] = $u;
            }
        }
    }

    // Images inside the description tab
    foreach ($xp->query("//div[contains(@class,'woocommerce-Tabs-panel--description')]//img | //div[@id='tab-description']//img") as $img) {
        if ($img->hasAttribute('src')) $imgUrls[] = $img->getAttribute('src');
    }

    // Absolutise and de-duplicate. The outer array_filter drops the null that
    // make_absolute_url() returns for whitespace-only attribute values.
    $imgUrls = array_values(array_unique(array_filter(array_map(function($u) use ($abs){
        return $abs(trim($u));
    }, array_filter($imgUrls)))));

    // ===== Description (HTML + Text) =====
    $descNode = firstNode($xp, [
        "//div[contains(@class,'woocommerce-Tabs-panel--description')]",
        "//div[@id='tab-description']",
        "//div[contains(@class,'product-short-description')]",
    ]);
    $description_html  = $descNode ? sanitizeInnerHTML($descNode) : null;
    $description_text  = $descNode ? trimAll(nodeTextWithBreaks($descNode)) : null;

    // ===== Output =====
    $out = [
        'ok'         => true,
        'fetched_at' => gmdate('c'),
        'url_input'  => $rawUrl,   // raw user input
        'url'        => $url,      // normalised (ASCII) URL
        'name'       => nullIfEmpty($name),
        'sku'        => nullIfEmpty($sku),
        'brand'      => nullIfEmpty($brand),
        'prices'     => $prices,
        'features'   => (object)$features, // force a JSON object even when empty
        'images'     => $imgUrls,
        'description'=> [
            'html' => $description_html,
            'text' => $description_text
        ]
    ];

    echo json_encode($out, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_PRETTY_PRINT);
    exit;

} catch (Throwable $e) {
    http_response_code(400);
    echo json_encode([
        'ok' => false,
        'error' => $e->getMessage(),
    ], JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_PRETTY_PRINT);
    exit;
}

/* ========================= Helpers ========================= */

/**
 * Download a page over HTTP(S) with browser-like headers.
 *
 * The transfer is pinned to the http and https protocols, both for the initial
 * request and for every redirect hop. Without that restriction libcurl would
 * happily serve file://, gopher://, dict:// ... URLs, which turns a scraper fed
 * with caller-supplied URLs into a local-file / internal-service reader.
 *
 * @param string $url Absolute URL to fetch.
 * @return string Response body (redirects followed, gzip decoded).
 * @throws Exception On transport failure, a disallowed protocol, or an HTTP status >= 400.
 */
function fetch_html($url) {
    $ch = curl_init();
    if ($ch === false) {
        throw new Exception('cURL error: unable to initialise a cURL handle');
    }
    $headers = [
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language: fa-IR,fa;q=0.9,en-US;q=0.8,en;q=0.7',
        'Cache-Control: no-cache',
        'Pragma: no-cache',
        'Connection: keep-alive',
        'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123 Safari/537.36'
    ];
    curl_setopt_array($ch, [
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS => 5,
        // Web protocols only, including after redirects.
        CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        CURLOPT_CONNECTTIMEOUT => 15,
        CURLOPT_TIMEOUT => 30,
        CURLOPT_HTTPHEADER => $headers,
        CURLOPT_ENCODING => 'gzip',
        CURLOPT_SSL_VERIFYPEER => true,
        CURLOPT_SSL_VERIFYHOST => 2,
    ]);
    $res = curl_exec($ch);
    if ($res === false) {
        // Read the message before the handle is released.
        $err = curl_error($ch);
        curl_close($ch);
        throw new Exception('cURL error: ' . $err);
    }
    $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);
    if ($code >= 400) {
        throw new Exception('HTTP error code: ' . $code);
    }
    return $res;
}

/**
 * Turn a user-supplied (possibly Unicode) link into an ASCII-safe URL.
 *
 * - Adds "https://" when the input has no scheme.
 * - Converts an IDN host to Punycode when ext-intl provides idn_to_ascii().
 * - Percent-encodes user info, every path segment, query keys/values and the
 *   fragment; input that is already encoded is decoded first, so nothing is
 *   encoded twice.
 * - A "+" in the query string is kept as "+": there it is the form encoding of
 *   a space, and rewriting it to "%2B" would turn it into a literal plus sign
 *   on the target server. An explicit "%2B" stays "%2B".
 *
 * @param string $input Raw URL as typed by the caller.
 * @return string Normalised URL; when the input cannot be parsed at all the
 *                (scheme-prefixed) input is returned for the caller to validate.
 */
function normalize_input_url($input) {
    $u = trim($input);

    // No scheme given: assume https.
    if (!preg_match('#^[a-z][a-z0-9+\-.]*://#i', $u)) {
        $u = 'https://' . $u;
    }

    $parts = parse_url($u);
    if ($parts === false) {
        // Second chance: stray whitespace is the usual reason parsing fails.
        $stripped = preg_replace('/\s+/u', '', $u);
        if ($stripped === null) {
            return $u; // not valid UTF-8; let the caller's validation reject it
        }
        $u = $stripped;
        $parts = parse_url($u);
        if ($parts === false) {
            return $u;
        }
    }

    $scheme   = $parts['scheme'] ?? 'http';
    $host     = $parts['host']   ?? '';
    $user     = $parts['user']   ?? null;
    $pass     = $parts['pass']   ?? null;
    $port     = isset($parts['port']) ? ':' . $parts['port'] : '';
    $path     = $parts['path']   ?? '';
    $query    = $parts['query']  ?? null;
    $fragment = $parts['fragment'] ?? null;

    // Decode-then-encode keeps already-encoded input stable.
    $reencode = static function ($piece) {
        return rawurlencode(rawurldecode($piece));
    };
    // Same, but leaves the form-encoding "+" separators of a query component untouched.
    $reencodeQuery = static function ($piece) use ($reencode) {
        return implode('+', array_map($reencode, explode('+', $piece)));
    };

    // Unicode host -> ACE (Punycode)
    if ($host !== '' && function_exists('idn_to_ascii')) {
        $host_idn = idn_to_ascii($host, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46);
        if ($host_idn !== false) {
            $host = $host_idn;
        }
    }

    // user/pass (when present)
    $auth = '';
    if ($user !== null) {
        $auth = $reencode($user);
        if ($pass !== null) {
            $auth .= ':' . $reencode($pass);
        }
        $auth .= '@';
    }

    // Path, segment by segment so the "/" separators survive.
    $encPath = implode('/', array_map($reencode, explode('/', $path)));

    // Query
    $encQuery = '';
    if ($query !== null && $query !== '') {
        $pairs = [];
        foreach (explode('&', $query) as $pair) {
            if ($pair === '') continue;
            $kv = explode('=', $pair, 2);
            $k = $reencodeQuery($kv[0]);
            $v = isset($kv[1]) ? $reencodeQuery($kv[1]) : '';
            $pairs[] = $k . '=' . $v;
        }
        if ($pairs) $encQuery = '?' . implode('&', $pairs);
    }

    // Fragment
    $encFrag = '';
    if ($fragment !== null && $fragment !== '') {
        $encFrag = '#' . $reencode($fragment);
    }

    return $scheme . '://' . $auth . $host . $port . $encPath . $encQuery . $encFrag;
}

/**
 * Try a list of XPath expressions in order and return the first usable text.
 *
 * For each expression only the first matched node is inspected: an attribute
 * node contributes its value, any other node its text content. The text is
 * normalised with trimAll(); an expression whose first node normalises to an
 * empty string is skipped in favour of the next expression.
 *
 * @param DOMXPath $xp     XPath evaluator for the parsed page.
 * @param array    $xpaths Candidate expressions, in priority order.
 * @return string|null The normalised text, or null when nothing matched.
 */
function firstText(DOMXPath $xp, array $xpaths) {
    foreach ($xpaths as $expression) {
        $matches = $xp->query($expression);
        if (!$matches || !$matches->length) {
            continue;
        }

        $first = $matches->item(0);
        if ($first instanceof DOMAttr) {
            $raw = $first->value;
        } else {
            $raw = $first->textContent;
        }

        $text = trimAll($raw);
        if ($text !== '') {
            return $text;
        }
    }

    return null;
}

/**
 * Return the first node matched by the first XPath expression that matches anything.
 *
 * @param DOMXPath $xp     XPath evaluator for the parsed page.
 * @param array    $xpaths Candidate expressions, in priority order.
 * @return DOMNode|null The matched node, or null when every expression comes up empty.
 */
function firstNode(DOMXPath $xp, array $xpaths) {
    foreach ($xpaths as $expression) {
        $matches = $xp->query($expression);
        if (!$matches || !$matches->length) {
            continue;
        }
        return $matches->item(0);
    }

    return null;
}

/**
 * Serialise the children of a node to HTML and drop script/style/iframe elements.
 *
 * The markup is produced child by child with the owner document's saveHTML(),
 * trimmed, and then <script>, <style> and <iframe> elements (tags plus content)
 * are removed with a regular expression. Nothing else is filtered.
 *
 * @param DOMNode $node Node whose inner HTML is wanted.
 * @return string|null Inner HTML without the removed elements (null if the regex engine fails).
 */
function sanitizeInnerHTML(DOMNode $node) {
    $owner = $node->ownerDocument;

    $fragments = [];
    foreach ($node->childNodes as $childNode) {
        $fragments[] = $owner->saveHTML($childNode);
    }
    $markup = trim(implode('', $fragments));

    // Strip script/style/iframe so the output stays clean.
    return preg_replace('#<(script|style|iframe)\b[^>]*>.*?</\1>#is', '', $markup);
}

/**
 * Plain-text rendering of a node: <br> becomes a newline, all other tags are removed.
 *
 * The text is derived from serialised HTML, in which characters such as "&",
 * "<" or a non-breaking space appear as entities (&amp;, &lt;, &nbsp;).
 * strip_tags() does not touch entities, so they are decoded afterwards;
 * otherwise the "text" output would still contain HTML escapes.
 *
 * @param DOMNode $node Node to render.
 * @return string Text content with line breaks preserved and entities decoded.
 */
function nodeTextWithBreaks(DOMNode $node) {
    $html = (string)sanitizeInnerHTML($node);
    $text = preg_replace('#<br\s*/?>#i', "\n", $html);
    $text = strip_tags((string)$text);
    // Decode only after the tags are gone, so an escaped "&lt;b&gt;" stays visible text.
    return html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');
}

/**
 * Normalise whitespace in a piece of scraped text.
 *
 * Removes the invisible marks ZWNJ (U+200C), RLM (U+200F) and LRM (U+200E),
 * collapses every whitespace run into one space and trims both ends.
 *
 * @param string|null $s Text to normalise.
 * @return string Normalised text; an empty string for null.
 */
function trimAll($s) {
    if ($s === null) {
        return '';
    }

    $withoutMarks = preg_replace('/\x{200C}|\x{200F}|\x{200E}/u', '', $s); // ZWNJ/LRM/RLM
    $singleSpaced = preg_replace("/\s+/u", ' ', $withoutMarks);

    return trim($singleSpaced);
}

/**
 * Normalise a value with trimAll() and map an empty result to null.
 *
 * @param mixed $s Value to normalise (cast to string first).
 * @return string|null The normalised text, or null when nothing is left.
 */
function nullIfEmpty($s) {
    $normalized = trimAll((string)$s);
    if ($normalized === '') {
        return null;
    }
    return $normalized;
}

/**
 * Reduce a price string to its digits and separators.
 *
 * Two passes: the known currency labels are removed first, then every
 * character that is not a digit (Latin, Persian or Arabic-Indic), a thousands
 * separator or a dot. Digits are NOT converted to Latin here.
 *
 * @param string|null $s Price text as scraped.
 * @return string|null Cleaned amount (possibly an empty string), or null for null input.
 */
function cleanMoney($s) {
    if ($s === null) {
        return null;
    }

    // Applied in order: currency labels, then anything that is not part of a number.
    $passes = [
        '/\s*(تومان|ريال|ریال|IRT|IRR)\s*/u',
        '/[^\d۰-۹٠-٩٬,\.]/u',
    ];

    return trim(preg_replace($passes, '', $s));
}

/**
 * Convert scraped price text to an integer amount.
 *
 * The caller passes the text of the price element, which normally carries the
 * currency label as well ("1,200,000 تومان"). Demanding a digits-only string
 * after removing the separators therefore rejected almost every real price.
 * Instead, digits are converted to Latin, thousands separators and spaces are
 * dropped, and the text must then contain exactly one number; any surrounding
 * label or symbol is ignored.
 *
 * Null is returned when the text holds no number, more than one number (e.g. a
 * price range), or a decimal amount, which this integer field does not represent.
 *
 * @param string|null $s Price text as scraped.
 * @return int|null Amount in whole currency units, or null when it cannot be determined.
 */
function normalizePrice($s) {
    if ($s === null) return null;

    $s = fa_to_en_digits((string)$s);
    // Thousands separators (Arabic and Latin) plus plain and non-breaking spaces.
    $s = str_replace(['٬', ',', ' ', "\u{00A0}"], '', $s);

    // Exactly one numeric run may remain; two or more means the text is ambiguous.
    if (preg_match_all('/\d+(?:\.\d+)?/', $s, $found) !== 1) return null;

    $number = $found[0][0];
    if (strpos($number, '.') !== false) return null; // decimals are not supported

    return (int)$number;
}

/**
 * Whitespace-normalise the "price updated at" text.
 *
 * @param string|null $t Text as scraped.
 * @return string|null Text passed through trimAll(), or null for an empty/falsy input.
 */
function normalizeDateText($t) {
    return $t ? trimAll($t) : null;
}

/**
 * Replace Persian (U+06F0..U+06F9) and Arabic-Indic (U+0660..U+0669) digits
 * with their Latin counterparts; every other character is left alone.
 *
 * @param string $s Text that may contain non-Latin digits.
 * @return string Text with Latin digits only.
 */
function fa_to_en_digits($s) {
    $digitMap = [
        // Persian
        '۰' => '0', '۱' => '1', '۲' => '2', '۳' => '3', '۴' => '4',
        '۵' => '5', '۶' => '6', '۷' => '7', '۸' => '8', '۹' => '9',
        // Arabic-Indic
        '٠' => '0', '١' => '1', '٢' => '2', '٣' => '3', '٤' => '4',
        '٥' => '5', '٦' => '6', '٧' => '7', '٨' => '8', '٩' => '9',
    ];

    return str_replace(array_keys($digitMap), array_values($digitMap), $s);
}

/**
 * Split text in two at the earliest occurrence of any of the given delimiters.
 *
 * All delimiters are located first and the one that appears earliest in the
 * text wins (on a tie, the one listed first). Taking the first delimiter of
 * the list that occurs anywhere would split "a：b:c" at the late ":" merely
 * because ":" is listed before "：". Empty delimiters are ignored.
 *
 * @param array  $delims Delimiter strings.
 * @param string $text   Text to split (multibyte-safe).
 * @return array Two strings: [before, after], or [$text, ''] when no delimiter occurs.
 */
function mb_explode_first(array $delims, $text) {
    $text = (string)$text;
    $cutAt = null;
    $cutLen = 0;

    foreach ($delims as $d) {
        $d = (string)$d;
        if ($d === '') continue; // an empty needle carries no position information
        $pos = mb_strpos($text, $d);
        if ($pos === false) continue;
        if ($cutAt === null || $pos < $cutAt) {
            $cutAt = $pos;
            $cutLen = mb_strlen($d);
        }
    }

    if ($cutAt === null) {
        return [$text, ''];
    }

    return [mb_substr($text, 0, $cutAt), mb_substr($text, $cutAt + $cutLen)];
}

/**
 * Tidy a scraped brand label.
 *
 * Normalises whitespace with trimAll() and removes a leading
 * "برند:", "Brand:" or "دسته:" prefix (ASCII or full-width colon, case-insensitive).
 *
 * @param string|null $brand Brand text as scraped.
 * @return string|null Cleaned brand, or null when the input or the result is empty/falsy.
 */
function cleanBrand($brand) {
    if (!$brand) {
        return null;
    }

    $stripped = preg_replace('/^(برند|Brand|دسته)\s*[:：]\s*/ui', '', trimAll($brand));

    return $stripped ? $stripped : null;
}

/**
 * Resolve a link found in the page against the page URL.
 *
 * Handles the reference forms that occur in product markup:
 * - a reference that already has a scheme (http:, https:, data:, ...) is returned untouched;
 * - a protocol-relative reference ("//cdn.example/x.jpg") inherits only the base scheme;
 * - a root-relative reference ("/x.jpg") replaces the base path;
 * - a query-only or fragment-only reference ("?v=2", "#top") keeps the base path;
 * - anything else is resolved against the directory of the base path.
 *
 * "." and ".." segments are collapsed in the path only: the query string and
 * fragment are split off first and re-attached verbatim, and a trailing slash
 * is preserved.
 *
 * @param string      $base Absolute URL of the page.
 * @param string|null $rel  Link as found in the markup.
 * @return string|null Absolute URL, or null for an empty link. When $base
 *                     cannot be parsed the link is returned unchanged.
 */
function make_absolute_url($base, $rel) {
    if (!$rel) return null;

    // Anything carrying its own scheme is already absolute (or not resolvable at all).
    if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $rel)) return $rel;

    $p = parse_url($base);
    if (!$p) return $rel;

    $scheme = isset($p['scheme']) ? $p['scheme'] : 'http';

    // Protocol-relative: the reference names its own host, only the scheme is inherited.
    if (substr($rel, 0, 2) === '//') {
        return $scheme . ':' . $rel;
    }

    $host     = isset($p['host']) ? $p['host'] : '';
    $port     = isset($p['port']) ? ':' . $p['port'] : '';
    $basePath = isset($p['path']) ? $p['path'] : '/';

    // Keep "?query#fragment" out of the dot-segment handling below.
    $suffix = '';
    $cut = strcspn($rel, '?#');
    if ($cut < strlen($rel)) {
        $suffix = substr($rel, $cut);
        $rel = substr($rel, 0, $cut);
    }

    if ($rel === '') {
        // Query-only or fragment-only reference: same document path.
        $path = $basePath;
        if ($suffix[0] === '#' && isset($p['query'])) {
            $suffix = '?' . $p['query'] . $suffix; // a bare fragment keeps the base query
        }
    } elseif ($rel[0] === '/') {
        $path = $rel;
    } else {
        // Relative to the directory of the base path.
        $path = preg_replace('#/[^/]*$#', '/', $basePath) . $rel;
    }

    $segments = [];
    foreach (explode('/', $path) as $seg) {
        if ($seg === '' || $seg === '.') continue;
        if ($seg === '..') array_pop($segments);
        else $segments[] = $seg;
    }
    $normalized = '/' . implode('/', $segments);
    // A path ending in "/", "/." or "/.." names a directory: keep its trailing slash.
    if ($segments && preg_match('#/(\.\.?)?$#', $path)) {
        $normalized .= '/';
    }

    return $scheme . '://' . $host . $port . $normalized . $suffix;
}