<?php
/**
 * Digikala Category Scraper via Sitemap (Progress + Resume)
 * PHP 7.4+
 * Run in browser: /digikala-categories-progress.php
 * Resume: add ?resume=1
 *
 * Outputs: categories.json + categories.csv
 * Keeps state: categories_state.json (for resume)
 */

// The scrape can run for a long time: remove the execution time limit and
// raise the memory cap. Both calls are prefixed with @ so any warning they
// emit is suppressed.
@set_time_limit(0);
@ini_set('memory_limit','512M');

const ROBOTS_URL    = 'https://www.digikala.com/robots.txt';   // where the sitemap index URL is discovered
const UA            = 'DigiMasterCategoryBot/1.1 (+https://example.com)';   // User-Agent sent by http_get()
const FETCH_TITLES  = false;     // when true, each category page is also fetched to read its <h1> title (slower)
const MAX_GZ        = 200;       // safety cap on the number of .gz sitemap files processed
const DELAY_US      = 120000;    // pause after each successful request, in microseconds, to be polite to the server
const STATE_FILE    = __DIR__ . '/categories_state.json';      // checkpoint used by ?resume=1
const OUT_JSON      = __DIR__ . '/categories.json';            // final JSON output
const OUT_CSV       = __DIR__ . '/categories.csv';             // final CSV output
const LOG_FILE      = __DIR__ . '/categories_progress.log';    // append-only progress/error log

/* ---------------- UI ---------------- */
// Must be sent before the HTML shell below produces any output.
header('Content-Type: text/html; charset=utf-8');
?>
<!doctype html><html lang="fa" dir="rtl"><meta charset="utf-8">
<title>استخراج دسته‌های دیجی‌کالا – با پیشرفت</title>
<style>
body{font-family:system-ui,Segoe UI,Roboto,Arial,sans-serif;max-width:960px;margin:24px auto;padding:0 16px;background:#f8fafc;color:#0f172a}
h1{margin:.2em 0 .6em}
.card{background:#fff;border:1px solid #e5e7eb;border-radius:12px;padding:16px;margin:12px 0;box-shadow:0 6px 22px rgba(15,23,42,.04)}
.progress-wrap{display:flex;align-items:center;gap:10px}
.track{flex:1;height:14px;background:#eef2ff;border-radius:999px;overflow:hidden;border:1px solid #e0e7ff}
.bar{height:100%;width:0%;background:linear-gradient(90deg,#6366f1,#06b6d4)}
.pct{min-width:58px;font-weight:700;color:#4338ca}
.mono{font-family:ui-monospace,Menlo,Consolas,monospace;background:#0b1020;color:#e5edff;border-radius:10px;padding:10px;border:1px solid #111827;max-height:260px;overflow:auto}
.badge{display:inline-block;font-size:12px;padding:4px 10px;border-radius:999px;border:1px solid #e5e7eb;background:#f3f4f6;color:#334155;margin-right:6px}
small.muted{color:#64748b}
</style>
<div class="card">
  <h1>استخراج دسته‌های دیجی‌کالا</h1>
  <div class="progress-wrap">
    <div class="track"><div id="bar" class="bar"></div></div>
    <div id="pct" class="pct">0%</div>
  </div>
  <div style="margin-top:10px">
    <span class="badge" id="stat-total">GZ: 0</span>
    <span class="badge" id="stat-done">پردازش‌شده: 0</span>
    <span class="badge" id="stat-cats">دسته‌ها: 0</span>
  </div>
</div>
<div class="card">
  <b>مرحله فعلی:</b>
  <div id="stage" style="margin-top:6px"><small class="muted">در حال شروع…</small></div>
</div>
<div class="card">
  <b>گزارش:</b>
  <pre id="log" class="mono"></pre>
</div>
<script>
// Progress bar: round the value, clamp it to 0..100, and mirror it in the bar width and the label.
function setPct(v) {
  const clamped = Math.max(0, Math.min(100, Math.round(v)));
  const label = clamped + '%';
  document.getElementById('bar').style.width = label;
  document.getElementById('pct').textContent = label;
}

// Current-stage line: the argument is inserted as HTML (callers pass markup such as <code>…</code>).
function setStage(s) {
  const stageEl = document.getElementById('stage');
  stageEl.innerHTML = s;
}

// Report pane: append one plain-text line and keep the pane scrolled to the bottom.
function logln(s) {
  const pane = document.getElementById('log');
  pane.textContent += s + "\n";
  pane.scrollTop = pane.scrollHeight;
}

// Counter badges: replace the text of the badge with the given element id.
function stat(id, v) {
  const badge = document.getElementById(id);
  badge.textContent = v;
}
</script>
<?php
// Flush PHP's output buffers so the page shell emitted above is sent out
// before the scraping work below begins (warnings suppressed with @).
@ob_flush(); @flush();

/* ---------------- Utils ---------------- */

/**
 * Appends one timestamped line to LOG_FILE.
 *
 * Write failures are deliberately ignored (the call is silenced with @), so
 * logging can never interrupt the scrape.
 *
 * @param string $msg Text to record; a trailing newline is added.
 */
function log_file($msg){
    $stamp = date('Y-m-d H:i:s');
    $line  = sprintf("[%s] %s\n", $stamp, $msg);
    @file_put_contents(LOG_FILE, $line, FILE_APPEND);
}
/**
 * Performs an HTTP GET with cURL and returns the response body.
 *
 * Redirects are followed (up to 5). After a successful request the function
 * sleeps for DELAY_US microseconds so consecutive calls are rate-limited.
 *
 * @param string $url     Absolute URL to fetch.
 * @param bool   $gzip    true: ask for a compressed transfer and let cURL decode it;
 *                        false: send no Accept-Encoding header so the body arrives
 *                        as stored (used for the .gz sitemap files).
 * @param int    $timeout Total transfer timeout in seconds.
 * @return string Response body.
 * @throws RuntimeException When cURL cannot be initialised, the transfer fails,
 *                          or the final status code is not 2xx.
 */
function http_get($url, $gzip = true, $timeout = 25) {
    $ch = curl_init($url);
    if ($ch === false) {
        throw new RuntimeException("HTTP 0 for $url | curl_init failed");
    }

    $options = [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS      => 5,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT        => $timeout,
        CURLOPT_USERAGENT      => UA,
        CURLOPT_REFERER        => 'https://www.digikala.com/',
        CURLOPT_HTTPHEADER     => ['Accept: */*'],
    ];
    if ($gzip) {
        // An empty string makes cURL advertise exactly the encodings this build
        // can decode. A hard-coded list containing "br" would invite brotli
        // responses that a libcurl without brotli support cannot read.
        $options[CURLOPT_ENCODING] = '';
    }
    // When $gzip is false the option is left unset on purpose: passing '' would
    // ENABLE every supported encoding instead of disabling compression.
    curl_setopt_array($ch, $options);

    $body = curl_exec($ch);
    $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $err  = curl_error($ch);
    curl_close($ch);

    if ($body === false || $code < 200 || $code >= 300) {
        throw new RuntimeException("HTTP $code for $url" . ($err ? " | $err" : ''));
    }

    // Politeness delay between requests.
    usleep(DELAY_US);
    return $body;
}

/**
 * Extracts the .gz sitemap URLs listed in a sitemap index document.
 *
 * Every <loc> element is inspected; values ending in ".gz" (case-insensitive)
 * are kept, de-duplicated, and returned in document order.
 *
 * @param string $xml Raw sitemap index XML.
 * @return string[] List of unique .gz URLs; empty when the input is empty or not well-formed XML.
 */
function parse_sitemap_xml_for_gz($xml) {
    // DOMDocument::loadXML() throws ValueError on an empty string in PHP 8,
    // and @ does not suppress exceptions, so reject empty input up front.
    if (!is_string($xml) || trim($xml) === '') {
        return [];
    }

    // Collect libxml parse errors internally instead of emitting warnings,
    // then restore whatever mode the caller had.
    $previousMode = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    // LIBXML_NONET: never touch the network while parsing remote-supplied XML.
    $loaded = $dom->loadXML($xml, LIBXML_NONET);
    libxml_clear_errors();
    libxml_use_internal_errors($previousMode);

    if (!$loaded) {
        return [];
    }

    $gz = [];
    foreach ($dom->getElementsByTagName('loc') as $loc) {
        $u = trim($loc->nodeValue);
        if (preg_match('~\.gz$~i', $u)) {
            $gz[] = $u;
        }
    }
    return array_values(array_unique($gz));
}

/**
 * Extracts every non-empty <loc> value from one sitemap file.
 *
 * The payload is gzip-decoded first; if decoding fails it is treated as
 * plain XML (a server or HTTP client may already have decompressed it).
 *
 * @param string $gz_bytes Raw body of a sitemap file, gzip-compressed or plain XML.
 * @return string[] URLs in document order (duplicates are kept); empty when the
 *                  input is empty or cannot be parsed as XML.
 */
function parse_gz_for_urls($gz_bytes) {
    if (!is_string($gz_bytes) || $gz_bytes === '') {
        return [];
    }

    // Best-effort decode: a failure here only means "not gzip", which is
    // handled by the plain-XML fallback, so the warning is silenced.
    $xml = @gzdecode($gz_bytes);
    if ($xml === false) {
        $xml = $gz_bytes;
    }

    // DOMDocument::loadXML() throws ValueError on an empty string in PHP 8
    // (e.g. when a gzip stream decodes to nothing), so stop before parsing.
    if (trim($xml) === '') {
        return [];
    }

    $previousMode = libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    // LIBXML_NONET: never touch the network while parsing remote-supplied XML.
    $loaded = $dom->loadXML($xml, LIBXML_NONET);
    libxml_clear_errors();
    libxml_use_internal_errors($previousMode);

    if (!$loaded) {
        return [];
    }

    $urls = [];
    foreach ($dom->getElementsByTagName('loc') as $loc) {
        $u = trim($loc->nodeValue);
        if ($u !== '') {
            $urls[] = $u;
        }
    }
    return $urls;
}

/**
 * Tells whether a URL points at a category listing.
 *
 * Only the path component is examined. Two shapes are accepted, each with an
 * optional trailing slash: "/search/category-<slug>" and "/main/<slug>".
 *
 * @param string $url URL to classify.
 * @return bool true when the path matches one of the two category shapes.
 */
function is_category_url($url) {
    // parse_url() yields null/false when there is no usable path; the cast turns both into ''.
    $path = (string) parse_url($url, PHP_URL_PATH);

    $isSearchCategory = preg_match('~^/search/category-[a-z0-9\-]+/?$~i', $path) === 1;
    $isMainCategory   = preg_match('~^/main/[a-z0-9\-]+/?$~i', $path) === 1;

    return $isSearchCategory || $isMainCategory;
}

/**
 * Derives the category slug from a category URL.
 *
 * The path is tried against the "/search/category-<slug>" shape first and the
 * "/main/<slug>" shape second; the first captured slug wins. If neither shape
 * is found, the whole path with its surrounding slashes removed is returned.
 *
 * @param string $url Category URL.
 * @return string Slug, or the slash-trimmed path as a fallback.
 */
function slug_from_category_url($url) {
    $path = parse_url($url, PHP_URL_PATH) ?: '';

    $shapes = [
        '~/search/category-([a-z0-9\-]+)/?~i',
        '~/main/([a-z0-9\-]+)/?~i',
    ];
    foreach ($shapes as $shape) {
        $hit = [];
        if (preg_match($shape, $path, $hit) === 1) {
            return $hit[1];
        }
    }

    return trim($path, '/');
}

/**
 * Fetches a category page and returns its human-readable title.
 *
 * The first <h1> is preferred; if it is missing or empty the <title> element
 * is used instead. Markup is stripped, HTML entities are decoded (so
 * "A &amp; B" becomes "A & B"), and runs of whitespace collapse to one space.
 *
 * This is best-effort: any fetch failure is logged and yields ''.
 *
 * @param string $url Category page URL.
 * @return string Cleaned title, or '' when none could be obtained.
 */
function title_from_category_page($url) {
    // Turns the inner HTML of a matched element into single-line plain text.
    $toPlainText = static function ($fragment) {
        $text = html_entity_decode(strip_tags($fragment), ENT_QUOTES | ENT_HTML5, 'UTF-8');
        // /u treats the text as UTF-8, so multi-byte characters are never split
        // and Unicode spaces (e.g. a decoded &nbsp;) are collapsed as well.
        $collapsed = preg_replace('/\s+/u', ' ', $text);
        if ($collapsed === null) {
            // The page was not valid UTF-8; fall back to byte-wise matching.
            $collapsed = preg_replace('/\s+/', ' ', $text);
        }
        return trim((string) $collapsed);
    };

    try {
        $html = http_get($url, true, 20);
    } catch (\Throwable $e) {
        // A missing title must not abort the scrape; record why it is missing.
        log_file("WARN title fetch failed for $url: " . $e->getMessage());
        return '';
    }

    $candidates = [
        '~<h1[^>]*>(.*?)</h1>~is',
        '~<title[^>]*>(.*?)</title>~is',
    ];
    foreach ($candidates as $pattern) {
        if (preg_match($pattern, $html, $m)) {
            $title = $toPlainText($m[1]);
            if ($title !== '') {
                return $title;
            }
        }
    }
    return '';
}

/**
 * Writes the collected categories to OUT_JSON and OUT_CSV.
 *
 * The JSON file holds the rows as given. The CSV file has the header
 * url,slug,type,title followed by one line per row; a key missing from a row
 * is written as an empty cell.
 *
 * @param array $rows List of rows, each an array with url/slug/type/title keys.
 * @return void
 * @throws RuntimeException When the rows cannot be encoded or a file cannot be written.
 */
function write_outputs(array $rows){
    // JSON — substitute invalid UTF-8 instead of failing the whole export.
    $json = json_encode(
        $rows,
        JSON_UNESCAPED_UNICODE|JSON_UNESCAPED_SLASHES|JSON_PRETTY_PRINT|JSON_INVALID_UTF8_SUBSTITUTE
    );
    if ($json === false) {
        throw new RuntimeException('Could not encode categories as JSON: ' . json_last_error_msg());
    }
    if (file_put_contents(OUT_JSON, $json) === false) {
        throw new RuntimeException('Could not write ' . OUT_JSON);
    }

    // CSV
    $f = fopen(OUT_CSV, 'w');
    if ($f === false) {
        throw new RuntimeException('Could not open ' . OUT_CSV . ' for writing');
    }
    try {
        // Separator, enclosure and escape are spelled out (they equal the
        // historical defaults) so the output does not depend on PHP's defaults.
        fputcsv($f, ['url','slug','type','title'], ',', '"', '\\');
        foreach ($rows as $r) {
            fputcsv(
                $f,
                [$r['url'] ?? '', $r['slug'] ?? '', $r['type'] ?? '', $r['title'] ?? ''],
                ',',
                '"',
                '\\'
            );
        }
    } finally {
        // Always release the handle, even if a write above raised.
        fclose($f);
    }
}

/* ---------------- Stage 1: get sitemap list ---------------- */

// Stage 1: read robots.txt, find the sitemap index it advertises, download the
// index and collect the .gz sitemap files it lists (capped at MAX_GZ).

// Encodes a PHP string as a JavaScript string literal that is safe to print
// inside an inline <script> block (quotes, <, > and & become \uXXXX escapes).
$js = static function ($text) {
    $literal = json_encode(
        (string) $text,
        JSON_HEX_TAG|JSON_HEX_APOS|JSON_HEX_QUOT|JSON_HEX_AMP|JSON_INVALID_UTF8_SUBSTITUTE
    );
    return $literal === false ? '""' : $literal;
};

// Reports a fatal stage-1 error in the log file and in the page, then stops.
$abort = static function ($logLine, $stageHtml) use ($js) {
    log_file($logLine);
    echo "<script>logln(".$js($logLine)."); setStage(".$js($stageHtml).");</script>";
    @ob_flush(); @flush();
    exit;
};

echo "<script>setStage('دریافت <code>robots.txt</code> و مسیر <code>sitemap.xml</code>…');</script>"; @ob_flush(); @flush();
try {
    $robots = http_get(ROBOTS_URL);
} catch (Throwable $e) {
    // Without this the page would end in an uncaught-exception fatal error.
    $abort('ERROR: '.$e->getMessage(), 'خطا در دریافت <code>robots.txt</code>');
}
if (!preg_match('~sitemap:\s*(https?://[^\s]+)~i', $robots, $m)) {
    echo "<script>logln('ERROR: سایت‌مپ در robots.txt پیدا نشد');setStage('خطا: سایت‌مپ پیدا نشد');</script>"; exit;
}
$sitemapUrl = $m[1];
// logln() writes via textContent, so the URL needs JavaScript escaping, not HTML escaping.
echo "<script>logln(".$js('Sitemap index: '.$sitemapUrl).");</script>";
echo "<script>setStage('دانلود <code>sitemap.xml</code>…');</script>"; @ob_flush(); @flush();

try {
    $sitemapXml = http_get($sitemapUrl);
} catch (Throwable $e) {
    $abort('ERROR: '.$e->getMessage(), 'خطا در دریافت <code>sitemap.xml</code>');
}
$gzUrls = parse_sitemap_xml_for_gz($sitemapXml);
$gzUrls = array_slice($gzUrls, 0, MAX_GZ);
$total = count($gzUrls);

echo "<script>stat('stat-total','GZ: {$total}');</script>";
echo "<script>logln('Found gz files: {$total}');</script>";
@ob_flush(); @flush();

// Nothing to iterate; this also guarantees $total > 0 for the percentage maths later.
if ($total === 0){
    echo "<script>setStage('هیچ فایل GZ در سایت‌مپ یافت نشد');</script>"; exit;
}

/* ---------------- Resume support ---------------- */
// Defaults for a fresh run.
$startIndex = 0;   // index of the first .gz file still to be processed
$cat = [];         // collected category rows
$seen = [];        // url => true, for O(1) duplicate checks

// With ?resume=1 and a checkpoint on disk, continue where the last run stopped.
if (isset($_GET['resume']) && file_exists(STATE_FILE)) {
    $rawState = file_get_contents(STATE_FILE);
    $state = is_string($rawState) ? json_decode($rawState, true) : null;
    if (!is_array($state)) {
        $state = [];
    }

    // Clamp to the current file count: the sitemap may list fewer files than
    // it did when the checkpoint was written.
    $startIndex = isset($state['index']) ? min($total, max(0, (int) $state['index'])) : 0;

    // Reads one field of a saved row as a string; anything unusable becomes ''.
    $field = static function (array $row, $key) {
        return (isset($row[$key]) && is_scalar($row[$key])) ? (string) $row[$key] : '';
    };

    $savedRows = (isset($state['categories']) && is_array($state['categories'])) ? $state['categories'] : [];
    foreach ($savedRows as $r) {
        // Skip rows without a usable URL, and duplicates, so later stages can
        // rely on every row having all four keys.
        if (!is_array($r)) continue;
        $savedUrl = $field($r, 'url');
        if ($savedUrl === '' || isset($seen[$savedUrl])) continue;

        $seen[$savedUrl] = true;
        $cat[] = [
            'url'   => $savedUrl,
            'slug'  => $field($r, 'slug'),
            'type'  => $field($r, 'type'),
            'title' => $field($r, 'title'),
        ];
    }
    echo "<script>logln('Resume from index #{$startIndex}');</script>";
}

/* ---------------- Stage 2: iterate gz files ---------------- */

// Stage 2: download each .gz sitemap, keep the category URLs it contains, and
// checkpoint after every file so an interrupted run can be resumed.

// Encodes a PHP string as a JavaScript string literal that is safe to print
// inside an inline <script> block (quotes, <, > and & become \uXXXX escapes).
$js = static function ($text) {
    $literal = json_encode(
        (string) $text,
        JSON_HEX_TAG|JSON_HEX_APOS|JSON_HEX_QUOT|JSON_HEX_AMP|JSON_INVALID_UTF8_SUBSTITUTE
    );
    return $literal === false ? '""' : $literal;
};

$done = $startIndex;
echo "<script>stat('stat-done','پردازش‌شده: {$done}'); stat('stat-cats','دسته‌ها: ".count($cat)."'); setPct(".round(($done/$total)*100,2).");</script>"; @ob_flush(); @flush();

for ($i=$startIndex; $i<$total; $i++){
    $gzUrl   = $gzUrls[$i];
    $ordinal = $i + 1;
    $pct     = round(($i / $total) * 100, 2);

    // setStage() inserts HTML, so the URL is HTML-escaped first and the whole
    // fragment is then encoded as a JS literal. logln() writes plain text, so
    // it gets the raw URL encoded as a JS literal only.
    $stageHtml = "پردازش فایل GZ {$ordinal} از {$total}: <code>".htmlspecialchars($gzUrl)."</code>";
    echo "<script>setPct({$pct}); setStage(".$js($stageHtml)."); logln(".$js("GZ {$ordinal}/{$total}: {$gzUrl}").");</script>";
    @ob_flush(); @flush();
    log_file("GZ {$ordinal}/{$total}: $gzUrl");

    try{
        $bytes = http_get($gzUrl, false, 30);
        $urls  = parse_gz_for_urls($bytes);

        foreach ($urls as $u) {
            if (isset($seen[$u])) continue;
            if (!is_category_url($u)) continue;

            $seen[$u] = true;
            $slug = slug_from_category_url($u);
            $type = (strpos($u, '/search/category-') !== false) ? 'category' : 'main';
            $title = '';
            if (FETCH_TITLES) $title = title_from_category_page($u);

            $cat[] = ['url'=>$u, 'slug'=>$slug, 'type'=>$type, 'title'=>$title];

            // Refresh the counter only every 50 rows to limit output volume.
            if ((count($cat) % 50) === 0){
                echo "<script>stat('stat-cats','دسته‌ها: ".count($cat)."');</script>";
                @ob_flush(); @flush();
            }
        }

        $done = $ordinal;
        echo "<script>stat('stat-done','پردازش‌شده: {$done}'); stat('stat-cats','دسته‌ها: ".count($cat)."');</script>";
        @ob_flush(); @flush();

        // Save a checkpoint for resuming. It is written to a temporary file
        // and renamed into place so an interruption mid-write cannot leave a
        // truncated state file behind.
        $stateJson = json_encode([
            'index'      => $done,
            'categories' => $cat
        ], JSON_UNESCAPED_UNICODE|JSON_UNESCAPED_SLASHES|JSON_INVALID_UTF8_SUBSTITUTE);
        $stateTmp = STATE_FILE . '.tmp';
        if ($stateJson === false
            || file_put_contents($stateTmp, $stateJson, LOCK_EX) === false
            || !rename($stateTmp, STATE_FILE)) {
            // Not fatal: the scrape continues, only resume would restart earlier.
            log_file("WARN could not save resume state after gz #{$ordinal}");
        }

    } catch(Throwable $e){
        $msg = "ERROR on gz #{$ordinal} ($gzUrl): ".$e->getMessage();
        log_file($msg);
        // The exception text is arbitrary, so it must be encoded before it is
        // placed inside the script block.
        echo "<script>logln(".$js($msg)."); setStage(".$js("❌ خطا روی: <code>".htmlspecialchars($gzUrl)."</code>").");</script>";
        echo "<div class='card' style='background:#fff7ed;border-color:#fed7aa'><b>⚠️ خطا:</b> روی فایل زیر متوقف شد:<br><code>".htmlspecialchars($gzUrl)."</code><br><small>برای ادامه، همین صفحه را با <code>?resume=1</code> باز کنید.</small></div>";
        @ob_flush(); @flush();
        exit; // stop here so the user can decide whether to resume
    }
}

/* ---------------- Stage 3: finalize & outputs ---------------- */
// Stage 3: order the rows ("main" entries first, then natural case-insensitive
// order by slug), write the output files, and drop the resume checkpoint.
usort($cat, function($a,$b){
    if ($a['type'] !== $b['type']) return $a['type']==='main' ? -1 : 1;
    return strnatcasecmp($a['slug'],$b['slug']);
});

try {
    write_outputs($cat);
} catch (Throwable $e) {
    // Keep STATE_FILE so the collected rows are not lost; ?resume=1 retries the export.
    $msg = 'ERROR writing outputs: '.$e->getMessage();
    log_file($msg);
    $jsFlags = JSON_HEX_TAG|JSON_HEX_APOS|JSON_HEX_QUOT|JSON_HEX_AMP|JSON_INVALID_UTF8_SUBSTITUTE;
    echo "<script>logln(".json_encode($msg, $jsFlags)."); setStage(".json_encode('❌ خطا در نوشتن خروجی‌ها', $jsFlags).");</script>";
    @ob_flush(); @flush();
    exit;
}
// The checkpoint is only removed once the outputs exist.
@unlink(STATE_FILE);

// File names for the download links, escaped for use in single-quoted attributes.
$jsonName = htmlspecialchars(basename(OUT_JSON), ENT_QUOTES);
$csvName  = htmlspecialchars(basename(OUT_CSV), ENT_QUOTES);

echo "<script>setPct(100); setStage('✅ اتمام. خروجی‌ها ساخته شد.'); logln('Collected categories: ".count($cat)."');</script>";
// The names are interpolated here: a "<?php … ?>" tag inside a string literal
// is never executed, it would be printed verbatim into the href.
echo "<div class='card'>خروجی‌ها آماده شد: 
 <ul>
   <li><a href='{$jsonName}' target='_blank'>{$jsonName}</a></li>
   <li><a href='{$csvName}' target='_blank'>{$csvName}</a></li>
 </ul>
</div>";
@ob_flush(); @flush();