<?php
/**
 * product_scraper.php
 * PHP 7.4+
 * ورودی: ?url=<product page>
 * خروجی: JSON شامل اطلاعات محصول (تا هرچه در سورس قابل استخراج باشد)
 */

// Every response produced by this script is JSON encoded as UTF-8.
header('Content-Type: application/json; charset=UTF-8');
// Make UTF-8 the default encoding for the mb_* string functions used below.
mb_internal_encoding('UTF-8');

/**
 * Send a JSON response with the given HTTP status code and terminate the script.
 *
 * @param int   $code    HTTP status code to send.
 * @param mixed $payload Data serialised as the response body.
 */
function respond($code, $payload) {
  http_response_code($code);
  // Scraped pages can contain byte sequences that are not valid UTF-8. Without
  // JSON_INVALID_UTF8_SUBSTITUTE json_encode() returns false for such input and
  // the client would receive an empty body.
  $flags = JSON_UNESCAPED_UNICODE|JSON_UNESCAPED_SLASHES|JSON_PRETTY_PRINT|JSON_INVALID_UTF8_SUBSTITUTE;
  $json = json_encode($payload, $flags);
  if ($json === false) {
    // Encoding can still fail (e.g. NAN/INF or excessive depth); report that
    // explicitly instead of echoing nothing.
    http_response_code(500);
    $json = json_encode(['ok'=>false,'error'=>'json_encode failed: '.json_last_error_msg()]);
  }
  echo $json;
  exit;
}
/**
 * Send a successful (HTTP 200) JSON response and stop execution.
 *
 * @param mixed $data Payload handed to respond() unchanged.
 */
function ok($data){
  respond(200, $data);
}
/**
 * Send an error JSON response and stop execution.
 *
 * The body always carries 'ok' => false and 'error' => $msg; keys in $extra
 * are merged on top, so they override the defaults on collision.
 *
 * @param int    $code  HTTP status code.
 * @param string $msg   Error message.
 * @param array  $extra Additional fields for the response body.
 */
function fail($code,$msg,$extra=[]){
  $base = ['ok'=>false,'error'=>$msg];
  $payload = array_merge($base, $extra);
  respond($code, $payload);
}

/* -------------------- ابزار فارسی/نرمال‌سازی -------------------- */
/**
 * Replace Persian (U+06F0..U+06F9) and Arabic-Indic (U+0660..U+0669) digits
 * with their ASCII equivalents.
 *
 * @param string|null $s Input text.
 * @return string|null   Converted text, or null when $s is null.
 */
function fa_digits_to_en($s){
  if ($s === null) {
    return null;
  }

  // The array index of each glyph is the digit it stands for.
  $persian = ['۰','۱','۲','۳','۴','۵','۶','۷','۸','۹'];
  $arabic  = ['٠','١','٢','٣','٤','٥','٦','٧','٨','٩'];

  $table = [];
  foreach ([$persian, $arabic] as $glyphs) {
    foreach ($glyphs as $digit => $glyph) {
      $table[$glyph] = (string)$digit;
    }
  }

  return strtr($s, $table);
}

/**
 * Normalise Persian text: unify Arabic/Persian letter variants and typographic
 * punctuation, put exactly one space around separators, collapse whitespace
 * runs and drop a trailing "| فروشگاه …" suffix.
 *
 * @param string|null $s Input text.
 * @return string|null   Normalised text, or null when $s is null.
 */
function fa_normalize($s){
  if ($s===null) return null;
  $s = trim($s);
  $map = [
    'ي' => 'ی', 'ك' => 'ک', 'ۀ'=>'ه', 'ة'=>'ه', 'ؤ'=>'و', 'إ'=>'ا', 'أ'=>'ا', 'ى'=>'ی',
    '–' => '-', '—' => '-', 'ـ' => '-', '٬' => '،', '’'=>"'",'“'=>'"','”'=>'"',
  ];
  $s = strtr($s, $map);

  $steps = [
    // Exactly one space on each side of separators.
    ['/\s*([،,:;؛\-—–|])\s*/u', ' $1 '],
    // Collapse whitespace runs.
    ['/\s{2,}/u', ' '],
    // Remove the generic shop suffix at the end of titles.
    ['/\s*\|\s*فروشگاه.+$/u', ''],
  ];
  foreach ($steps as [$pattern, $replacement]) {
    $next = preg_replace($pattern, $replacement, $s);
    // With the /u modifier preg_replace() returns null on malformed UTF-8.
    // Keep the previous value so the text is not silently replaced by ''.
    if ($next !== null) $s = $next;
  }
  return trim($s);
}

/**
 * Extract an integer price from free text such as "۱٬۲۵۰٬۰۰۰ تومان".
 *
 * Persian/Arabic digits are converted first, then everything except digits
 * and dots is dropped. Two or more dots cannot form a decimal number, so they
 * are treated as thousands separators ("1.250.000" -> 1250000). A single dot
 * is still read as a decimal point and the value is rounded.
 *
 * @param string|null $s Raw price text.
 * @return int|null      Rounded price, or null when no digit is present.
 */
function parse_price_number($s){
  if ($s===null) return null;
  $s = fa_digits_to_en($s);
  // Arabic decimal separator (U+066B) -> ASCII dot, before non-digits are stripped.
  $s = str_replace("\u{066B}", '.', (string)$s);
  // Drop separators, currency words and any other non-numeric characters.
  $s = preg_replace('/[^\d\.]/', '', $s);
  if ($s === null || $s === '') return null;
  // A string made only of dots carries no number.
  if (!preg_match('/\d/', $s)) return null;
  if (substr_count($s, '.') > 1) {
    // e.g. "1.250.000": dots are grouping separators, not decimal points.
    $s = str_replace('.', '', $s);
  }
  // Prices are reported as whole units.
  return (int)round((float)$s, 0);
}

/* -------------------- ورودی و اعتبارسنجی -------------------- */
/**
 * Read the target URL from the request.
 *
 * Sources are tried in order: the `url` query parameter, the `url` form
 * field, then a JSON request body of the form {"url": "..."}. Only non-empty
 * strings are accepted, so array-valued input such as `?url[]=x` is ignored
 * rather than being passed on to trim().
 *
 * @return string|null Trimmed URL, or null when none was supplied.
 */
function get_input_url(){
  // Accept a candidate only when it is a string that is non-empty after trimming.
  $pick = static function ($value) {
    if (!is_string($value)) return null;
    $value = trim($value);
    return $value !== '' ? $value : null;
  };

  $url = $pick($_GET['url'] ?? null) ?? $pick($_POST['url'] ?? null);
  if ($url === null) {
    $raw = file_get_contents('php://input');
    if (is_string($raw) && $raw !== '') {
      $j = json_decode($raw, true);
      if (json_last_error()===JSON_ERROR_NONE && is_array($j)) {
        $url = $pick($j['url'] ?? null);
      }
    }
  }
  return $url;
}

/**
 * Check that $url is an absolute http(s) URL with a host.
 *
 * parse_url() is used instead of FILTER_VALIDATE_URL because the filter
 * rejects URLs that contain raw non-ASCII characters (e.g. Persian slugs).
 * Only http and https are accepted: this value is handed to cURL, and other
 * schemes such as file:// would let a caller read local files.
 *
 * NOTE(review): this does not block requests to private or loopback
 * addresses; add a host/IP allow-list if the endpoint is publicly reachable.
 *
 * @param mixed $url Candidate URL.
 * @return bool      True when the URL may be fetched.
 */
function is_valid_url($url){
  if (!is_string($url) || $url === '') return false;
  // Control characters (CR/LF in particular) have no place in a URL and
  // could be used to inject request headers.
  if (preg_match('/[\x00-\x1F\x7F]/', $url)) return false;

  $parts = parse_url($url);
  if (!is_array($parts) || empty($parts['scheme']) || empty($parts['host'])) return false;

  return in_array(strtolower($parts['scheme']), ['http', 'https'], true);
}

/* -------------------- شبکه: واکشی مقاوم -------------------- */
/**
 * Download a page with cURL using browser-like headers.
 *
 * The request is first made with HTTP/2 over TLS. If that attempt yields no
 * body and cURL reports an error, it is retried once with HTTP/1.1.
 *
 * @param string $url     Page URL.
 * @param int    $timeout Connect and total timeout in seconds.
 * @return array [body|false|null, curl_getinfo() array, error message|null, cURL errno|null]
 */
function fetch_html($url, $timeout=20){
  // Referer: the URL cut after "/product/" (the URL itself when that segment is absent).
  $referer = preg_replace('~(/product/).*~', '$1', $url);
  if (!is_string($referer)) $referer = (string)$url;
  // Never let CR/LF from the URL end up inside a request header.
  $referer = str_replace(["\r", "\n"], '', $referer);

  $headers = [
    'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language: fa-IR,fa;q=0.9,en-US;q=0.8,en;q=0.7',
    'Cache-Control: no-cache',
    'Pragma: no-cache',
    'Referer: ' . $referer,
  ];

  // Options shared by both attempts; only the HTTP version differs.
  $baseOptions = [
    CURLOPT_RETURNTRANSFER  => true,
    CURLOPT_FOLLOWLOCATION  => true,
    CURLOPT_MAXREDIRS       => 7,
    CURLOPT_CONNECTTIMEOUT  => $timeout,
    CURLOPT_TIMEOUT         => $timeout,
    CURLOPT_SSL_VERIFYPEER  => true,
    CURLOPT_SSL_VERIFYHOST  => 2,
    CURLOPT_ENCODING        => '',
    CURLOPT_IPRESOLVE       => CURL_IPRESOLVE_V4,
    CURLOPT_USERAGENT       => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0 Safari/537.36',
    CURLOPT_HTTPHEADER      => $headers,
    // Only ever speak HTTP(S), for the initial request and for redirects, so a
    // crafted URL or redirect cannot reach file://, gopher:// and similar.
    CURLOPT_PROTOCOLS       => CURLPROTO_HTTP | CURLPROTO_HTTPS,
    CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
  ];

  // Run one transfer and return [body, info, errno, error].
  $attempt = static function ($httpVersion) use ($url, $baseOptions) {
    $ch = curl_init($url);
    if ($ch === false) {
      return [false, [], CURLE_FAILED_INIT, 'curl_init failed'];
    }
    curl_setopt_array($ch, $baseOptions + [CURLOPT_HTTP_VERSION => $httpVersion]);
    $body   = curl_exec($ch);
    $result = [$body, curl_getinfo($ch), curl_errno($ch), curl_error($ch)];
    curl_close($ch);
    return $result;
  };

  [$body, $info, $errNo, $err] = $attempt(CURL_HTTP_VERSION_2TLS);
  if ($body || !$errNo) {
    // Either content was received or cURL reported no error: no retry.
    return [$body, $info, $err ?: null, $errNo ?: null];
  }

  // Fallback to HTTP/1.1.
  [$body2, $info2, $errNo2, $err2] = $attempt(CURL_HTTP_VERSION_1_1);
  if ($body2 && !$errNo2 && ($info2['http_code']??0) < 400) {
    return [$body2, $info2, null, null];
  }
  // Report the retry's error message and errno together, falling back to the
  // first attempt's values when the retry produced none.
  return [null, $info2 ?: $info, $err2 ?: $err, $errNo2 ?: $errNo];
}

/* -------------------- DOM/XPath -------------------- */
/**
 * Parse an HTML string and return an XPath evaluator over the resulting DOM.
 *
 * DOMDocument::loadHTML() falls back to a single-byte encoding when the
 * markup declares no charset, which turns UTF-8 Persian text into mojibake.
 * When the input is valid UTF-8, an encoding hint is prepended for the parser
 * and the node it leaves behind is removed again. Input that is not valid
 * UTF-8 is parsed unchanged so that pages declaring another charset keep
 * working as before.
 *
 * @param string $html Raw page markup.
 * @return DOMXPath
 */
function make_xpath($html){
  // Collect parser diagnostics internally instead of emitting PHP warnings.
  libxml_use_internal_errors(true);
  $dom = new DOMDocument();

  $html = (string)$html;
  $hintUtf8 = mb_check_encoding($html, 'UTF-8');
  $source = $hintUtf8 ? '<?xml encoding="UTF-8">' . $html : $html;
  $dom->loadHTML($source, LIBXML_NOWARNING|LIBXML_NOERROR);

  if ($hintUtf8) {
    // The hint is parsed as a processing instruction at document level; drop
    // it. The node list is copied first because it is live.
    foreach (iterator_to_array($dom->childNodes) as $node) {
      if ($node->nodeType === XML_PI_NODE) $dom->removeChild($node);
    }
  }

  libxml_clear_errors();
  return new DOMXPath($dom);
}

/* -------------------- استخراج متا/OG/کنونیکال -------------------- */
/**
 * Collect title, description, Open Graph and canonical data from the document.
 *
 * Attribute values are matched case-insensitively. A key is present in the
 * result only when its node exists; 'og_images_all' is always present and
 * lists every og:image value.
 *
 * @param DOMXPath $xp XPath evaluator for the page.
 * @return array       Map of title, desc, og:title, og:type, og:url, og:image,
 *                     og:price, og:avail, canonical (when found) plus og_images_all.
 */
function extract_head_meta(DOMXPath $xp){
  // XPath 1.0 has no lower-case(); translate() stands in for it.
  $lower = static function ($attr) {
    return 'translate(@' . $attr . ',"ABCDEFGHIJKLMNOPQRSTUVWXYZ","abcdefghijklmnopqrstuvwxyz")';
  };
  $metaContent = static function ($attr, $value) use ($lower) {
    return '//meta[' . $lower($attr) . '="' . $value . '"]/@content';
  };

  $queries = [
    'title'     => '//title',
    'desc'      => $metaContent('name', 'description'),
    'og:title'  => $metaContent('property', 'og:title'),
    'og:type'   => $metaContent('property', 'og:type'),
    'og:url'    => $metaContent('property', 'og:url'),
    'og:image'  => $metaContent('property', 'og:image'),
    'og:price'  => $metaContent('property', 'og:price'),
    'og:avail'  => $metaContent('property', 'og:availability'),
    'canonical' => '//link[' . $lower('rel') . '="canonical"]/@href',
  ];

  $found = [];
  foreach ($queries as $key => $query) {
    $hits = $xp->query($query);
    if (!$hits || !$hits->length) {
      continue;
    }
    // Only the first match is kept for each single-valued key.
    $first = $hits->item(0);
    $found[$key] = trim($key === 'title' ? $first->textContent : $first->nodeValue);
  }

  // Every og:image value, not just the first one.
  $found['og_images_all'] = [];
  $ogImages = $xp->query($queries['og:image']);
  if ($ogImages) {
    foreach ($ogImages as $attr) {
      $found['og_images_all'][] = trim($attr->nodeValue);
    }
  }
  return $found;
}

/* -------------------- JSON-LD (Product/Offer/…) -------------------- */
/**
 * Collect every JSON-LD node that has an "@type" and group the nodes by type.
 *
 * All <script type="application/ld+json"> blocks are decoded. If a block does
 * not decode as-is, a second attempt is made with trailing "// ..." comments
 * removed. Nodes are gathered breadth-first, so results keep document order
 * and top-level nodes come before nested ones (for example, the page's own
 * Product precedes products embedded deeper in the graph).
 *
 * @param DOMXPath $xp XPath evaluator for the page.
 * @return array       Lists keyed by Product, Offer, BreadcrumbList,
 *                     Organization, WebPage and Other.
 */
function extract_jsonld(DOMXPath $xp){
  $nodes = $xp->query('//script[@type="application/ld+json"]');
  $items = [];
  if ($nodes) {
    foreach ($nodes as $s) {
      $txt = trim($s->textContent);
      if ($txt==='') continue;

      // Candidate payloads: the raw text, then the text without "// ..." comments.
      $candidates = [$txt];
      $stripped = preg_replace('/\s+\/\/.*$/m','',$txt);
      if (is_string($stripped) && $stripped !== $txt) $candidates[] = $stripped;

      foreach ($candidates as $t) {
        $d = json_decode($t, true);
        if (json_last_error()!==JSON_ERROR_NONE || !$d) continue;

        // Breadth-first walk with an index cursor: keeps document order and
        // avoids the reversal a LIFO stack would introduce.
        $queue = [$d];
        for ($i = 0; $i < count($queue); $i++) {
          $v = $queue[$i];
          if (!is_array($v)) continue;
          if (isset($v['@type'])) $items[] = $v;
          foreach ($v as $vv) {
            if (is_array($vv)) $queue[] = $vv;
          }
        }
        break;
      }
    }
  }

  $out=['Product'=>[],'Offer'=>[],'BreadcrumbList'=>[],'Organization'=>[],'WebPage'=>[],'Other'=>[]];
  foreach ($items as $it) {
    // "@type" may be a single string or a list; use the first type we track.
    $types = is_array($it['@type']) ? $it['@type'] : [$it['@type']];
    $bucket = 'Other';
    foreach ($types as $t) {
      if (is_string($t) && isset($out[$t])) { $bucket = $t; break; }
    }
    $out[$bucket][] = $it;
  }
  return $out;
}

/* -------------------- __NEXT_DATA__ -------------------- */
/**
 * Decode the Next.js "__NEXT_DATA__" payload and guess store/product nodes.
 *
 * The decoded tree is walked depth-first. The last array stored under a key
 * named store/shop/seller/business becomes the store guess, and the last
 * array that has an "id" together with a "title" or "name" becomes the
 * product guess.
 *
 * @param DOMXPath $xp XPath evaluator for the page.
 * @return array|null  ['raw' => decoded payload, 'store_guess' => array|null,
 *                     'product_guess' => array|null], or null when the script
 *                     is missing or does not decode to an array.
 */
function extract_next_data(DOMXPath $xp){
  $n = $xp->query('//script[@id="__NEXT_DATA__"]');
  if(!$n || !$n->length) return null;

  $d = json_decode(trim($n->item(0)->textContent), true);
  // Valid JSON can still be a scalar or null; RecursiveArrayIterator only
  // accepts arrays, so anything else is treated as "no data".
  if (json_last_error()!==JSON_ERROR_NONE || !is_array($d)) return null;

  $store=null; $product=null;
  $it = new RecursiveIteratorIterator(new RecursiveArrayIterator($d), RecursiveIteratorIterator::SELF_FIRST);
  foreach($it as $k=>$v){
    if (!is_array($v)) continue;
    if (in_array($k, ['store','shop','seller','business'], true)) $store=$v;
    if (isset($v['id']) && (isset($v['title']) || isset($v['name']))) $product=$v;
  }
  return ['raw'=>$d,'store_guess'=>$store,'product_guess'=>$product];
}

/* -------------------- Heading/Slug نام‌گذاری -------------------- */
/**
 * Return the normalised text of the page's main heading.
 *
 * Candidates are tried in this order: <h1> with a "product" class, any <h1>,
 * <h2> with a "product" class, any <h2>. For each candidate only the first
 * matching element is inspected; a blank one moves on to the next candidate.
 *
 * @param DOMXPath $xp XPath evaluator for the page.
 * @return string|null Normalised heading text, or null when none is found.
 */
function extract_main_heading(DOMXPath $xp){
  foreach (['h1', 'h2'] as $tag) {
    // A class-restricted match is preferred over a bare tag match.
    foreach (['[contains(@class,"product")]', ''] as $predicate) {
      $hits = $xp->query('//' . $tag . $predicate);
      if (!$hits || !$hits->length) {
        continue;
      }
      $text = trim($hits->item(0)->textContent);
      if ($text === '') {
        continue;
      }
      return fa_normalize($text);
    }
  }
  return null;
}

/**
 * Derive a human-readable product name from the last path segment of a URL.
 *
 * The segment is URL-decoded, a leading numeric id ("123-") is removed,
 * dashes and underscores become spaces, and the result is normalised.
 *
 * @param string $url Product page URL.
 * @return string     Name guessed from the slug (may be empty).
 */
function name_from_slug_smart($url){
  $path = parse_url($url, PHP_URL_PATH) ?? '';

  // Last path segment, without surrounding slashes or whitespace.
  $segment = basename($path);
  $segment = trim($segment, "/ \t\n\r\0\x0B");
  $slug = urldecode($segment);

  // Strip a leading numeric id such as "123-".
  $slug = preg_replace('~^[0-9]+-~u', '', $slug);
  // Word separators used in slugs become spaces.
  $slug = strtr($slug, ['-' => ' ', '_' => ' ']);
  $slug = preg_replace('/\s{2,}/u', ' ', $slug);

  return fa_normalize($slug);
}

/* -------------------- تصاویر/قیمت/موجودی/ویژگی‌ها -------------------- */
/**
 * Collect distinct image URLs from common gallery/product markup and og:image.
 *
 * Sources are scanned in a fixed order and the first occurrence of each URL
 * determines its position in the result.
 *
 * @param DOMXPath $xp XPath evaluator for the page.
 * @return array       List of unique, trimmed image URLs.
 */
function extract_images(DOMXPath $xp){
  // Common gallery layouts, most specific first.
  $sources = [
    '//div[contains(@class,"gallery")]//img/@src',
    '//div[contains(@class,"product")]//img/@src',
    '//img[contains(@class,"product")]/@src',
    '//meta[@property="og:image"]/@content',
  ];

  // URLs are used as array keys so duplicates collapse while order is kept.
  $seen = [];
  foreach ($sources as $expr) {
    $hits = $xp->query($expr);
    if (!$hits) {
      continue;
    }
    foreach ($hits as $attr) {
      $src = trim($attr->nodeValue);
      if ($src) {
        $seen[$src] = true;
      }
    }
  }
  return array_keys($seen);
}

/**
 * Guess price, currency and availability from the visible page markup.
 *
 * Price: the first element matching each price selector is parsed in turn
 * until one yields a non-zero number.
 * Currency: every element whose own text mentions a currency word or code is
 * inspected until one is recognised. "تومان" maps to IRT and "ریال" to IRR;
 * the ISO-like codes IRT/IRR are accepted as well.
 * Availability: 'out_of_stock' or 'in_stock', derived from Persian stock phrases.
 *
 * @param DOMXPath $xp XPath evaluator for the page.
 * @return array       [int|null $price, string|null $currency, string|null $availability]
 */
function extract_price_and_availability(DOMXPath $xp){
  $priceNodes = [
    '//span[contains(@class,"price") and not(contains(@class,"old"))]',
    '//div[contains(@class,"price") and not(contains(@class,"old"))]',
    '//span[contains(@class,"amount")]',
    '//span[contains(@class,"product-price")]',
    '//span[contains(@class,"price-value")]',
    '//*[contains(@class,"price") and contains(@class,"final")]',
  ];
  $price = null;
  foreach($priceNodes as $x){
    $n = $xp->query($x);
    if($n && $n->length){
      $t = fa_normalize(trim($n->item(0)->textContent));
      $p = parse_price_number($t);
      if($p){ $price = $p; break; }
    }
  }

  // Currency guess. "Rial" is also matched in its Arabic-yeh spelling
  // (U+064A), which raw page text may use. "TL" is deliberately not matched:
  // the substring is too ambiguous to map to a currency.
  $rialArabicYeh = "\u{0631}\u{064A}\u{0627}\u{0644}";
  $currency = null;
  $curQuery = '//*[contains(text(),"تومان") or contains(text(),"ریال") or contains(text(),"' . $rialArabicYeh . '") or contains(text(),"IRR") or contains(text(),"IRT")]';
  $curHits = $xp->query($curQuery);
  if ($curHits) {
    // Look at every hit, not only the first, until a known currency is found.
    foreach ($curHits as $node) {
      $t = $node->textContent;
      if (mb_stripos($t,'تومان')!==false) { $currency='IRT'; break; }
      if (mb_stripos($t,'ریال')!==false || mb_stripos($t,$rialArabicYeh)!==false) { $currency='IRR'; break; }
      // Codes are matched case-sensitively to avoid hits inside ordinary words.
      if (strpos($t,'IRT')!==false) { $currency='IRT'; break; }
      if (strpos($t,'IRR')!==false) { $currency='IRR'; break; }
    }
  }

  // Availability.
  $availability = null;
  $availNodes = [
    '//*[contains(@class,"availability") or contains(@class,"stock") or contains(@class,"status") ]',
    '//*[contains(text(),"ناموجود") or contains(text(),"تمام شد")]',
    '//*[contains(text(),"موجود") or contains(text(),"در انبار")]',
  ];
  foreach($availNodes as $x){
    $n = $xp->query($x);
    if($n && $n->length){
      $t = fa_normalize(trim($n->item(0)->textContent));
      if ($t==='') continue;
      // The "unavailable" word contains the "available" word as a substring,
      // so the out-of-stock phrases must be tested first.
      if (mb_stripos($t,'ناموجود')!==false || mb_stripos($t,'تمام شد')!==false){
        $availability = 'out_of_stock'; break;
      }
      if (mb_stripos($t,'موجود')!==false || mb_stripos($t,'در انبار')!==false){
        $availability = 'in_stock'; break;
      }
    }
  }
  return [$price,$currency,$availability];
}

/**
 * Extract product attributes from specification tables and feature lists.
 *
 * Table rows contribute "header cell => data cell" pairs. List items written
 * as "key: value" contribute pairs as well; list items without a colon are
 * appended as plain bullet strings under numeric keys.
 *
 * @param DOMXPath $xp XPath evaluator for the page.
 * @return array       Mixed map of label => value pairs and bullet strings.
 */
function extract_attributes(DOMXPath $xp){
  $pairs = [];

  // Table form: <table class="attributes"><tr><th>label</th><td>value</td></tr>...
  $rows = $xp->query('//table[contains(@class,"attribute") or contains(@class,"spec") or contains(@class,"tech")]//tr');
  if ($rows) {
    foreach ($rows as $row) {
      $headerCells = $row->getElementsByTagName('th');
      $dataCells   = $row->getElementsByTagName('td');
      if (!$headerCells->length || !$dataCells->length) {
        continue;
      }
      $label = fa_normalize(trim($headerCells->item(0)->textContent));
      $value = fa_normalize(trim($dataCells->item(0)->textContent));
      if ($label === '' || $value === '') {
        continue;
      }
      $pairs[$label] = $value;
    }
  }

  // List form: <ul class="features"><li>key: value</li></ul>
  $items = $xp->query('//ul[contains(@class,"feature") or contains(@class,"spec") or contains(@class,"attr")]//li');
  if ($items) {
    foreach ($items as $item) {
      $text = fa_normalize(trim($item->textContent));
      if ($text === '') {
        continue;
      }

      $hasColon = strpos($text, ':') !== false || strpos($text, '：') !== false;
      if (!$hasColon) {
        // Not "key: value": keep the text as a plain bullet.
        $pairs[] = $text;
        continue;
      }

      // Split on the first colon only, so values may contain colons.
      $parts = preg_split('/\s*[:：]\s*/u', $text, 2);
      if (count($parts) != 2) {
        continue;
      }
      [$label, $value] = $parts;
      if ($label !== '' && $value !== '') {
        $pairs[$label] = $value;
      }
    }
  }

  return $pairs;
}

/* -------------------- بردکرامب -------------------- */
/**
 * Read breadcrumb links from the page markup.
 *
 * Dedicated breadcrumb containers (a <nav>, <ul> or <ol> whose class mentions
 * "breadcrumb", or a <nav> whose aria-label does) are used when present. Only
 * when none yields a link does the function fall back to every link inside
 * any <nav>, so a page with a proper breadcrumb no longer gets its main-menu
 * links mixed into the trail.
 *
 * build_output() calls this only when the JSON-LD has no BreadcrumbList.
 *
 * @param DOMXPath $xp XPath evaluator for the page.
 * @return array       List of ['name' => string, 'url' => string].
 */
function extract_breadcrumb(DOMXPath $xp){
  $queries = [
    // Dedicated breadcrumb containers.
    '//nav[contains(@class,"breadcrumb") or contains(@aria-label,"breadcrumb") or contains(@aria-label,"Breadcrumb")]//a'
      . ' | //ul[contains(@class,"breadcrumb")]//a | //ol[contains(@class,"breadcrumb")]//a',
    // Fallback: any navigation link.
    '//nav//a',
  ];

  foreach ($queries as $query) {
    $nodes = $xp->query($query);
    if (!$nodes || !$nodes->length) continue;

    $res = [];
    foreach ($nodes as $a) {
      $txt = fa_normalize(trim($a->textContent));
      $href= $a->getAttribute('href');
      if ($txt && $href) $res[]=['name'=>$txt, 'url'=>$href];
    }
    // Stop at the first source that produced at least one usable link.
    if ($res) return $res;
  }
  return [];
}

/* -------------------- ساخت خروجی -------------------- */
/**
 * Assemble the JSON payload for a product page.
 *
 * Structured sources are preferred over markup heuristics: JSON-LD first,
 * then Open Graph/meta tags, then XPath guesses on the page body.
 *
 * @param string   $url Requested page URL.
 * @param DOMXPath $xp  XPath evaluator for the fetched page.
 * @return array        Response payload (ok, source_url, canonical, head,
 *                      product, breadcrumbs, jsonld, next_data, fetched_at).
 */
function build_output($url, DOMXPath $xp){
  $meta   = extract_head_meta($xp);
  $jsonld = extract_jsonld($xp);
  $next   = extract_next_data($xp);
  $ldProduct = $jsonld['Product'][0] ?? [];

  // Name detection order: main heading, JSON-LD, URL slug, og:title, <title>.
  $name = extract_main_heading($xp);
  if (!$name && !empty($ldProduct['name']) && is_string($ldProduct['name'])) {
    $name = fa_normalize($ldProduct['name']);
  }
  if (!$name) $name = name_from_slug_smart($url);
  if (!$name) {
    if (!empty($meta['og:title'])) $name = fa_normalize($meta['og:title']);
    elseif (!empty($meta['title'])) $name = fa_normalize($meta['title']);
  }
  // Site-specific rule: on bazarganigeram.ir a name containing the generic
  // "قیمت و خرید" phrase is replaced by the slug-derived name.
  if (preg_match('~bazarganigeram\.ir~i', $url) && $name && mb_stripos($name,'قیمت و خرید')!==false){
    $name = name_from_slug_smart($url);
  }

  // Price / currency / availability from JSON-LD, then OG, then XPath.
  $offer = $ldProduct['offers'] ?? $jsonld['Offer'][0] ?? null;
  $price = null; $currency = null; $availability = null;
  if (is_array($offer)){
    // "offers" may be a single Offer/AggregateOffer or a list of offers.
    $first = (isset($offer[0]) && is_array($offer[0])) ? $offer[0] : [];
    // AggregateOffer carries lowPrice/highPrice instead of price.
    $price = $offer['price'] ?? $offer['lowPrice'] ?? $first['price'] ?? $first['lowPrice'] ?? null;
    $currency = $offer['priceCurrency'] ?? $first['priceCurrency'] ?? null;
    $availability = $offer['availability'] ?? $first['availability'] ?? null;
    if (is_string($price)) $price = parse_price_number($price);
  }
  if (!$price && !empty($meta['og:price']) && strtolower($meta['og:price'])!=='undefined'){
    $price = parse_price_number($meta['og:price']);
  }
  if (!$availability && !empty($meta['og:avail'])) $availability = $meta['og:avail'];

  // Anything still missing (currency included) is guessed from the page body.
  if (!$price || !$availability || !$currency){
    [$p2,$c2,$a2] = extract_price_and_availability($xp);
    if(!$price && $p2) $price=$p2;
    if(!$currency && $c2) $currency=$c2;
    if(!$availability && $a2) $availability=$a2;
  }

  // Images. A JSON-LD "image" may be a URL string, an ImageObject, or a list
  // of either; reduce all of them to a flat list of URL strings.
  $toUrls = static function ($image) {
    if (is_string($image)) return [$image];
    if (!is_array($image)) return [];
    if (isset($image['url']) || isset($image['contentUrl']) || isset($image['@type'])) {
      $image = [$image]; // a single ImageObject
    }
    $urls = [];
    foreach ($image as $entry) {
      if (is_string($entry)) {
        $urls[] = $entry;
      } elseif (is_array($entry)) {
        $u = $entry['url'] ?? ($entry['contentUrl'] ?? null);
        if (is_string($u)) $urls[] = $u;
      }
    }
    return $urls;
  };
  $images = $toUrls($ldProduct['image'] ?? null);
  if (empty($images) && !empty($meta['og_images_all'])) $images = $meta['og_images_all'];
  if (empty($images)) $images = extract_images($xp);
  $images = array_values(array_unique(array_filter($images)));

  // Description.
  $desc = $ldProduct['description'] ?? ($meta['desc'] ?? null);
  if (!is_string($desc)) $desc = $meta['desc'] ?? null;
  $desc = $desc ? fa_normalize($desc) : null;

  // Attributes.
  $attributes = extract_attributes($xp);

  // SKU / MPN.
  $sku = $ldProduct['sku'] ?? null;
  $mpn = $ldProduct['mpn'] ?? null;

  // Breadcrumbs: JSON-LD when available, otherwise links from the markup.
  $breadcrumbs = [];
  if (!empty($jsonld['BreadcrumbList'][0]['itemListElement'])){
    foreach ($jsonld['BreadcrumbList'][0]['itemListElement'] as $it) {
      $nm  = $it['name'] ?? ($it['item']['name'] ?? ($it['item']['@id'] ?? null));
      $href= $it['item'] ?? null;
      if (is_array($href)) $href = $href['@id'] ?? ($href['url'] ?? null);
      if (is_scalar($nm) && $nm) $breadcrumbs[]=['name'=>fa_normalize((string)$nm),'url'=>$href];
    }
  } else {
    $breadcrumbs = extract_breadcrumb($xp);
  }

  // Store details guessed from __NEXT_DATA__.
  $store=null;
  if ($next && $next['store_guess']){
    $pick = $next['store_guess'];
    $store = [
      'name'      => $pick['name'] ?? ($pick['title'] ?? null),
      'phone'     => $pick['phone'] ?? null,
      'mobile'    => $pick['mobile'] ?? null,
      'is_open'   => $pick['is_open'] ?? null,
      'instagram' => $pick['instagram'] ?? null,
      'address'   => $pick['address'] ?? null,
    ];
  }

  return [
    'ok'          => true,
    'source_url'  => $url,
    'canonical'   => $meta['canonical'] ?? ($meta['og:url'] ?? null),
    'head'        => [
      'title'       => $meta['title'] ?? null,
      'description' => $meta['desc'] ?? null,
      'og' => [
        'type'         => $meta['og:type'] ?? null,
        'title'        => $meta['og:title'] ?? null,
        'url'          => $meta['og:url'] ?? null,
        'price'        => $meta['og:price'] ?? null,
        'availability' => $meta['og:avail'] ?? null,
        'images'       => $meta['og_images_all'] ?? [],
      ],
    ],
    'product'     => [
      'name'         => $name,
      'description'  => $desc,
      'images'       => $images,
      'sku'          => $sku,
      'mpn'          => $mpn,
      'price'        => $price,
      'currency'     => $currency,
      'availability' => $availability,
      'attributes'   => $attributes,
    ],
    'breadcrumbs' => $breadcrumbs,
    'jsonld'      => $jsonld,        // kept for debugging/validation
    'next_data'   => [
      'has_next_data' => (bool)$next,
      'store_guess'   => $store,
    ],
    'fetched_at'  => gmdate('c'),
  ];
}

/* -------------------- اجرا -------------------- */
// Entry point: validate the requested URL, fetch the page, and answer with
// the extracted product data. fail() and ok() both terminate the script.
$requestedUrl = get_input_url();
if (!$requestedUrl) {
  fail(400, 'پارامتر url اجباری است. نمونه: ?url=https%3A%2F%2Fbazarganigeram.ir%2Fproduct%2F...');
}
if (!is_valid_url($requestedUrl)) {
  fail(400, 'url نامعتبر است. لینک کامل با http(s) بدهید یا آن را URLEncode کنید.');
}

[$pageHtml, $transfer, $curlError, $curlErrno] = fetch_html($requestedUrl);
$status = (int)($transfer['http_code'] ?? 0);

// An empty body or an HTTP error status is reported upstream as 502.
$fetchFailed = !$pageHtml || $status >= 400;
if ($fetchFailed) {
  $details = [
    'http_code' => $status,
    'curl_errno'=> $curlErrno,
    'curl_error'=> $curlError,
    'final_url' => $transfer['url'] ?? $requestedUrl,
  ];
  fail(502, 'خطا در دریافت صفحه', $details);
}

ok(build_output($requestedUrl, make_xpath($pageHtml)));
