#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
盲人医疗按摩题库 统一JSON转换脚本
===================================
扫描指定目录下的所有题库.txt文件，识别不同格式变体，
统一转换为标准JSON格式输出。

支持的格式变体：
  A - 标准逐行：题目 → A/B/C/D选项 → 正确答案：X
  B - 内置答案：题目(字母) → 选项
  C - 底部答题卡：题目在上，答案表在底部
  D - 内联序号答案：题目（序号、字母）→ 选项
  E - 华秀编号格式：带2B/4D前缀选项
  F - 带解析版：答案 + 解析文字
  G - 同排紧凑：A.选项 B.选项 C.选项 D.选项在一行

输出：每个文件生成同名.json，以及一份汇总统计日志。
"""

import os
import re
import json
import logging
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional

# ============================================================
# Configuration
# ============================================================
SOURCE_DIR = Path(__file__).parent  # the directory holding this script is treated as the question-bank directory
OUTPUT_DIR = SOURCE_DIR / "standardized_json"  # presumably where the generated JSON is written; confirm against the caller
LOG_FILE = SOURCE_DIR / "conversion_log.txt"  # path handed to the file log handler
ENCODING = "utf-8"

# ============================================================
# Logging
# ============================================================
logger = logging.getLogger("QBankConverter")
logger.setLevel(logging.DEBUG)

# File log: receives everything from DEBUG upwards.
file_handler = logging.FileHandler(LOG_FILE, encoding="utf-8")
file_handler.setLevel(logging.DEBUG)
file_fmt = logging.Formatter("%(asctime)s | %(levelname)-7s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
file_handler.setFormatter(file_fmt)

# 控制台日志（精简）- 使用 safe_console_writer 避免编码问题
import io

class SafeConsoleWriter:
    """
    Thin wrapper around a text stream that never fails on unencodable text.

    If the wrapped stream raises ``UnicodeEncodeError`` for a message, the
    message is written again with every non-ASCII character replaced by ``?``.
    """

    def __init__(self, stream):
        # The underlying stream; it only needs ``write`` and ``flush``.
        self.stream = stream

    def write(self, msg):
        """Write *msg*, falling back to an ASCII-only rendering on failure."""
        try:
            self.stream.write(msg)
        except UnicodeEncodeError:
            ascii_only = msg.encode("ascii", errors="replace").decode("ascii")
            self.stream.write(ascii_only)

    def flush(self):
        """Flush the wrapped stream."""
        self.stream.flush()

# Console log: INFO and above, message text only, routed through
# SafeConsoleWriter so unencodable characters cannot abort logging.
console_handler = logging.StreamHandler(SafeConsoleWriter(sys.stdout))
console_handler.setLevel(logging.INFO)
console_fmt = logging.Formatter("%(message)s")
console_handler.setFormatter(console_fmt)

# Attach both handlers to the module logger.
logger.addHandler(file_handler)
logger.addHandler(console_handler)

# ============================================================
# 统计器
# ============================================================
class Stats:
    """Counters and the collected error list for one conversion run."""

    def __init__(self):
        # Per-file outcome counters.
        self.total_files = 0
        self.processed_files = 0
        self.skipped_files = 0
        self.failed_files = 0
        # Number of questions counted so far.
        self.total_questions = 0
        # Recorded errors as (filename, line_no, detail) tuples.
        self.errors = []

    def log_error(self, fname: str, detail: str, line_no: int = 0):
        """Remember an error and send it to the module logger.

        A ``line_no`` of 0 means "no line information" and is left out of
        the log message.
        """
        self.errors.append((fname, line_no, detail))
        where = f" (行{line_no})" if line_no else ""
        logger.error(f"  [错误] {fname}{where}: {detail}")

    def log_warning(self, fname: str, detail: str, line_no: int = 0):
        """Send a warning to the module logger; warnings are not stored."""
        where = f" (行{line_no})" if line_no else ""
        logger.warning(f"  [警告] {fname}{where}: {detail}")

    def summary(self) -> str:
        """Return the multi-line summary report as a single string."""
        rule = "=" * 60
        report = [
            rule,
            "转换完成 汇总报告",
            rule,
            f"总文件数    : {self.total_files}",
            f"成功转换    : {self.processed_files}",
            f"跳过(非题库): {self.skipped_files}",
            f"失败        : {self.failed_files}",
            f"总题数      : {self.total_questions}",
            f"总错误数    : {len(self.errors)}",
        ]
        if self.errors:
            report.extend(["", "详细错误列表:"])
            for fname, lineno, detail in self.errors:
                suffix = f" ({lineno})" if lineno else ""
                report.append(f"  {fname}{suffix}: {detail}")
        report.append(rule)
        return "\n".join(report)

stats = Stats()  # module-wide instance; the parsers below report errors and warnings through it


# ============================================================
# Non-question-bank files (knowledge summaries; meant to be skipped)
# NOTE(review): where this set is consulted is not visible in this part of
# the file -- confirm against the caller.
# ============================================================
NON_QBANK_FILES = {
    # Knowledge summaries / exam-prep notes (no multiple-choice layout)
    "触诊诊断学应试指南.txt",
    "触诊实验.txt",
    "特殊检查.txt",
    "特殊试验.txt",
    "伤内妇儿分类.txt",
    "西医诊断学基础重点内容.txt",
    "选择穴位的能力.txt",
    "选择穴位的能力\ufeff.txt",  # BOM variant (U+FEFF embedded in the file name)
    "腧穴仪20套题库(1).txt",
    "conversion_log.txt",  # this script's own log file
}

# 短文件名匹配（如"2023.06.14.txt"内含题目但文件名不在列表中，用内容检测）
def is_likely_question_bank(text: str) -> bool:
    """
    Guess from content alone whether *text* is a multiple-choice question bank.

    Only the first 60 non-blank lines are inspected for numbered stems,
    A-D option lines and answer markers; the first 2000 characters are
    inspected for several options sharing one row.

    Returns True when any of these combinations holds:
      * a numbered line and at least six option lines,
      * at least six option lines and an answer marker,
      * an inline option row and an answer marker.
    """
    sample = [ln for ln in (raw.strip() for raw in text.split("\n")) if ln][:60]

    # "1、stem", "1.stem", "1：stem", "1 stem" or a digit run directly followed by text.
    saw_number = any(
        re.match(r'^\d+(?:[、.．:：\s]|(?=[^\d\s]))', ln) for ln in sample
    )

    # Lines opening with A-D (either case), with or without a separator.
    option_hits = sum(
        1 for ln in sample if re.match(r'^[A-Da-d](?:[、.．:：\s]?\s*\S)', ln)
    )

    # "正确答案：" style markers, or a line starting with "答：" / "答案：".
    saw_answer = any(
        re.search(r'正确[答案]{0,2}[：:]', ln) or re.search(r'^(答|答案)[：:]', ln)
        for ln in sample
    )

    enough_options = option_hits >= 6  # at least six option markers

    # Two options on one row, e.g. "A.x B.y".
    inline_row = re.search(
        r'[A-Da-d][.．、]+\S+[ \t]+[A-Da-d][.．、]+\S+', text[:2000]
    ) is not None

    if saw_number and enough_options:
        return True
    if enough_options and saw_answer:
        return True
    return inline_row and saw_answer


# ============================================================
# 格式检测
# ============================================================
class QuestionFormat:
    """String constants naming each layout the converter distinguishes."""

    # Line by line: stem, then the A/B/C/D options, then an answer line.
    STANDARD = "standard"
    # The stem itself carries the answer: "stem (letter)".
    INLINE_ANSWER = "inline"
    # Answers are listed in a key table at the bottom of the file.
    BOTTOM_KEY = "bottom_key"
    # The stem carries "(number, letter)".
    INLINE_NUM = "inline_num"
    # Huaxiu numbering: options prefixed like "2B" / "4D".
    HUAXIU = "huaxiu"
    # Answer line followed by explanation text.
    ANALYSIS = "analysis"
    # Several options share one row, separated by tabs or spaces.
    TAB_INLINE = "tab_inline"
    # None of the above was recognised.
    UNKNOWN = "unknown"

def detect_format(text: str) -> str:
    """
    Classify *text* as one of the ``QuestionFormat`` values.

    The first 50 lines (and, for the answer-key check, the last 30 lines) are
    probed with one regular expression per layout. Checks run in a fixed
    priority order and the first one that passes wins; ``UNKNOWN`` is
    returned when none passes.
    """
    rows = [raw.rstrip("\n\r") for raw in text.split("\n")]
    head = "\n".join(rows[:50])
    tail_text = "\n".join(rows[-30:])

    def hits(pattern: str, haystack: str, flags: int = 0) -> int:
        # Number of non-overlapping matches of pattern in haystack.
        return len(re.findall(pattern, haystack, flags))

    # 1. Huaxiu: option lines carrying a numeric prefix such as "2B：" / "4D：".
    if hits(r'^[ \t]*\d+[A-Da-d][：:]', head, re.MULTILINE) >= 3:
        return QuestionFormat.HUAXIU

    # 2. Analysis: an "答案：X <text>" line with explanation on the same line.
    if re.search(r'^答案[：:]\s*[A-Da-d]\s+\S', head, re.MULTILINE):
        return QuestionFormat.ANALYSIS

    # 3. Bottom key: rows like "1-10 ABBCD" near the end of the file.
    if hits(r'^\d+\s*[-—~]\s*\d+\s+[A-Da-d\sX]{3,}', tail_text, re.MULTILINE) >= 2:
        return QuestionFormat.BOTTOM_KEY

    # 4. Tab/space inline: at least two options sharing a row.
    if hits(r'^[A-Da-d][.．、]?\s*\S+[\t ]+[A-Da-d][.．、]?\s*\S+', head, re.MULTILINE) >= 2:
        return QuestionFormat.TAB_INLINE

    # 5. Inline number: "（number、letter）" using full-width parentheses.
    if hits(r'（[\d\s]+[、,，]\s*[A-Da-d]）', head) >= 3:
        return QuestionFormat.INLINE_NUM

    # 6. Inline answer: "（letter）" using full-width parentheses.
    if hits(r'（[A-Da-d]）', head) >= 3:
        return QuestionFormat.INLINE_ANSWER

    # 7. Standard: repeated "正确答案：X" style lines.
    if hits(r'正确[答案]{0,2}[：:]\s*[A-Da-d]', head) >= 3:
        return QuestionFormat.STANDARD

    return QuestionFormat.UNKNOWN


# ============================================================
# 清洗辅助函数
# ============================================================
def clean_option_text(t: str) -> str:
    """Trim surrounding whitespace, then drop trailing punctuation and spaces."""
    trailing_junk = "；;，,。.　 "
    return t.strip().rstrip(trailing_junk)

def normalize_answer(ans: str) -> str:
    """
    Reduce an answer token to a single upper-case letter.

    The input is stripped and upper-cased; the first A-D letter found is
    returned (so a prefixed token such as "2B" yields "B"). When no such
    letter is present the stripped, upper-cased input is returned as is.
    """
    candidate = ans.strip().upper()
    for ch in candidate:
        if ch in "ABCD":
            return ch
    return candidate

def qtype_from_stem(stem: str) -> str:
    """
    Return the question type for *stem*.

    Every question is currently classified as ``"single"``. The previous
    keyword check was removed because it had no effect: its pattern was a
    bracketed character class (matching any one listed character) rather
    than an alternation of keywords, and the branch it guarded did nothing.
    Wording such as "错误的是" also appears in single-answer questions, so
    the stem text alone is not a reliable signal.

    Args:
        stem: The question stem (unused for now; kept so callers need not
            change if a real heuristic is added later).

    Returns:
        The string ``"single"``.
    """
    return "single"

def extract_subject_from_filename(filename: str, fullpath: str) -> str:
    """
    Guess the subject from the file name and its path.

    Rules are tried in order and the first rule with a keyword contained in
    either the (lower-cased) file name or the (lower-cased) path wins, so
    specific keywords must come before generic ones that are substrings of
    them or that co-occur with them:

      * "伤内妇儿" is listed before "内妇儿";
      * "西医诊断" is listed before the generic "诊断";
      * the clinical subjects (儿科/妇科/内科/伤科) are listed before the
        generic "推拿"/"按摩", since names such as "妇科按摩学" contain both.

    Args:
        filename: Base name of the file.
        fullpath: Path of the file; searched in addition to the name.

    Returns:
        The subject name, or an empty string when no keyword matches.
    """
    fname = filename.lower()
    path = fullpath.lower()

    # Order matters: first match wins (see docstring).
    subject_map = [
        (["中医基础", "中基"], "中医基础理论"),
        (["解剖"], "解剖学"),
        (["经络", "腧穴"], "经络腧穴学"),
        (["触诊"], "触诊诊断学"),
        (["伤内妇儿"], "临床综合"),
        (["儿科", "小儿"], "儿科推拿学"),
        (["妇科"], "妇科按摩学"),
        (["内科", "内妇儿"], "内科按摩学"),
        (["伤科"], "伤科按摩学"),
        (["推拿", "按摩"], "按摩学基础"),
        (["西诊", "西医诊断"], "西医诊断学"),
        (["中诊", "诊断"], "中医诊断学"),
        (["模拟试题"], "模拟试题"),
        (["综合笔试"], "模拟试题"),
        (["光明题库", "2023习题", "2023.06"], "综合练习"),
        (["华秀"], "综合练习"),
        (["忙医考", "盲医考"], "综合"),
    ]

    for keywords, subject in subject_map:
        if any(k in fname or k in path for k in keywords):
            return subject
    return ""


# ============================================================
# 各格式解析器
# ============================================================

def parse_standard_format(lines: list, filename: str, has_analysis: bool = False) -> list:
    """
    Parse the standard line-by-line layout.

    Expected shape of one question:
        <number><optional separator><stem>
        A option
        B option
        C option
        D option    (three or four options may be present)
        正确答案：X

    Args:
        lines: The file content split into lines.
        filename: Name of the source file (not read inside this function).
        has_analysis: When true, non-blank lines after the answer line are
            appended to the question's analysis text until a line that looks
            like a question number, an option, or another answer line.

    Returns:
        A list of question dicts with keys ``id``, ``type``, ``stem``,
        ``options`` (letter -> text, ordered by letter) and ``answer`` (a
        list with the answer letter), plus ``analysis`` when any text was
        found. A numbered line for which no answer line is found before the
        next numbered line produces no question.
    """
    questions = []
    i = 0
    q_id = 0

    while i < len(lines):
        line = lines[i].rstrip("\n\r")

        # Skip blank lines.
        if not line.strip():
            i += 1
            continue

        # Skip headings and instructions that are clearly not questions.
        stripped = line.strip()
        if re.match(r'^(单项选择题|答题说明|一、|二、|三、|全国医疗|盲人医疗)', stripped):
            i += 1
            continue

        # Question number: accepts "1、stem", "1.stem", "1：stem", "1 stem"
        # and a digit run directly followed by text ("1stem").
        q_match = re.match(r'^[\s]*(\d+)(?:[、.．:：\s]|(?=[^\d\s]))\s*(.*?)$', stripped)

        if q_match:
            q_num = int(q_match.group(1))
            stem = q_match.group(2).strip()

            # Drop trailing parentheses (both widths) from the stem.
            stem = stem.rstrip("（(）)")

            # Collect the options that follow.
            options = {}
            opt_count = 0  # NOTE(review): incremented below but never read
            j = i + 1
            answer = ""
            analysis = ""

            # Skip blank lines directly after the stem.
            while j < len(lines) and not lines[j].strip():
                j += 1

            # Read option lines until an answer line or the next question.
            found_answer = False
            option_keys = []  # NOTE(review): appended to below but never read

            while j < len(lines):
                opt_line = lines[j].strip()
                if not opt_line:
                    j += 1
                    continue

                # Answer line? Accepts a run of A-D letters or the literal
                # markers "无答案" / "不知道".
                ans_match = re.match(
                    r'(?:正确[答案]{0,2}|答|答案)[：:]?\s*([A-Da-d]+|无答案|不知道)\s*(.*?)$',
                    opt_line
                )
                if ans_match:
                    raw_ans = ans_match.group(1).strip()
                    if raw_ans in ['无答案', '不知道']:
                        # Add a synthetic "无答案" option under the next free
                        # letter and make it the answer.
                        next_key = chr(ord('A') + len(options))
                        options[next_key] = '无答案'
                        answer = next_key
                    else:
                        # normalize_answer keeps only the first A-D letter.
                        answer = normalize_answer(raw_ans)
                    analysis_text = ans_match.group(2).strip()
                    if analysis_text:
                        analysis = analysis_text

                    # In analysis mode the explanation may continue over
                    # several following lines.
                    if has_analysis:
                        j += 1
                        extra_analysis = []
                        while j < len(lines):
                            nl = lines[j].strip()
                            # Stop at a question number, an option line or a
                            # new answer line.
                            if (re.match(r'^\d+[、.．:：\s]', nl) or
                                re.match(r'^[A-Da-d][、.．:：]', nl) or
                                re.match(r'^(正确[答案]{0,2}|答|答案)[：:]', nl)):
                                break
                            if nl:
                                extra_analysis.append(nl)
                            j += 1
                        if extra_analysis:
                            if analysis:
                                analysis += " " + " ".join(extra_analysis)
                            else:
                                analysis = " ".join(extra_analysis)
                        j -= 1  # step back one line; the outer loop resumes from index j and advances past it

                    found_answer = True
                    break

                # Option line? Accepts "Atext" (no separator) as well as
                # "A：text" and similar.
                opt_match = re.match(r'^([A-Da-d])\s*[、.．:：\s]?\s*(.*?)$', opt_line)
                if opt_match and opt_match.group(2).strip():
                    key = opt_match.group(1).upper()
                    opt_text = clean_option_text(opt_match.group(2))
                    # Guard against treating an answer line as an option.
                    if not re.match(r'^(正确[答案]|答|答案)', opt_text):
                        options[key] = opt_text
                        option_keys.append(key)
                        opt_count += 1
                        j += 1
                        continue

                # Next question (a line starting with a number)?
                if re.match(r'^\d+(?:[、.．:：\s]|(?=[^\d\s]))', opt_line):
                    break

                # Anything else is skipped.
                j += 1

            if found_answer:
                # Emit options ordered by letter (three-option questions
                # simply yield three entries).
                sorted_options = {}
                for k in sorted(options.keys()):
                    sorted_options[k] = options[k]

                answer_list = [answer] if answer else []

                q_id += 1
                q = {
                    # Use the number from the text; fall back to the running
                    # counter only when that number is 0.
                    "id": q_num if q_num else q_id,
                    "type": qtype_from_stem(stem),
                    "stem": stem,
                    "options": sorted_options,
                    "answer": answer_list,
                }
                if analysis:
                    q["analysis"] = analysis.strip()

                questions.append(q)

            # Resume the outer scan from where the inner scan stopped.
            i = j
            continue

        i += 1

    return questions


def parse_inline_answer_format(lines: list, filename: str) -> list:
    """
    Parse the layout where the stem line carries its own answer.

    Expected shape:
        N.stem text(letter)
        A option
        B option
        ...
    Blank lines between the rows are tolerated.

    Args:
        lines: The file content split into lines.
        filename: Name of the source file (not read inside this function).

    Returns:
        A list of question dicts (``id``, ``type``, ``stem``, ``options``,
        ``answer``). A stem line with no option lines before the next stem
        line produces no question.
    """
    body = "\n".join(lines)

    # Numbered line containing "(letter)" in either parenthesis width.
    header_re = re.compile(
        r'^[\s]*(\d+)[、.．:：\s]\s*(.*?)[（(]\s*([A-Da-d])\s*[）)](.*?)$',
        re.MULTILINE
    )
    # Option line: a letter A-D followed by a separator or whitespace.
    option_re = re.compile(
        r'^[ \t]*([A-Da-d])\s*[、.．:：\s]\s*(.*?)$',
        re.MULTILINE
    )

    headers = list(header_re.finditer(body))
    # Each question's option area ends where the next header begins.
    stops = [h.start() for h in headers[1:]]
    stops.append(len(body))

    parsed = []
    for header, stop in zip(headers, stops):
        number, lead, letter, trailer = header.groups()
        answer = normalize_answer(letter)

        stem = lead.strip()
        trailer = trailer.strip()
        if trailer:
            # Text after the parenthesised answer belongs to the stem.
            stem = stem + " " + trailer

        section = body[header.end():stop]
        options = {}
        for found in option_re.finditer(section):
            options[found.group(1).upper()] = clean_option_text(found.group(2))

        if not (options and answer):
            continue

        parsed.append({
            "id": int(number),
            "type": "single",
            "stem": stem,
            "options": dict(sorted(options.items())),
            "answer": [answer],
        })

    return parsed


def parse_bottom_key_format(lines: list, filename: str) -> list:
    """
    Parse the layout whose answers live in a key table.

    Questions and their options come first; rows such as "1-10 ABBBC" (or
    "1~10 ABBBC") give the answers for a consecutive range of question
    numbers, one letter per question. An ``X`` in a key row is a placeholder:
    it occupies a position but assigns no answer.

    Args:
        lines: The file content split into lines.
        filename: Name of the source file, used in log messages.

    Returns:
        A list of question dicts (``id``, ``type``, ``stem``, ``options``,
        ``answer``). Questions whose number has no entry in the key are
        reported as warnings and left out. An empty list is returned (and an
        error logged) when no key row is found at all.
    """
    questions = []
    line_texts = [l.rstrip("\n\r") for l in lines]

    # Pass 1: build question number -> answer letter from the key rows.
    answer_map = {}
    for line in line_texts:
        key_match = re.match(r'^(\d+)\s*[-—~]\s*(\d+)\s+([A-Da-d\sX]+)', line.strip())
        if not key_match:
            continue
        start = int(key_match.group(1))
        # The pattern admits any whitespace between letters (tabs,
        # full-width spaces, ...), so remove all of it -- not just ASCII
        # spaces -- otherwise every later letter is shifted by one question.
        answers_str = re.sub(r'\s+', '', key_match.group(3)).upper()
        for offset, letter in enumerate(answers_str):
            if letter in "ABCD":
                answer_map[start + offset] = letter

    if not answer_map:
        stats.log_error(filename, "底部答题卡格式：未找到答案表行")
        return questions

    # Pass 2: walk the questions and attach the answers.
    i = 0
    while i < len(line_texts):
        line = line_texts[i].strip()
        if not line:
            i += 1
            continue

        # Skip headings and instructions.
        if re.match(r'^(单项选择题|答题说明|一、|二、|盲人医疗|题型)', line):
            i += 1
            continue

        # Question number line.
        q_match = re.match(r'^[\s]*(\d+)(?:[、.．:：\s]|(?=[^\d\s]))\s*(.*?)$', line)
        if q_match:
            q_num = int(q_match.group(1))
            stem_text = q_match.group(2).strip()
            stem_text = stem_text.rstrip("（(）)")

            # Gather option lines until the next numbered line.
            options = {}
            j = i + 1
            while j < len(line_texts):
                nl = line_texts[j].strip()
                if not nl:
                    j += 1
                    continue
                # Next question.
                if re.match(r'^\d+[、.．:：]', nl):
                    break
                opt_match = re.match(r'^([A-Da-d])[.．、]?\s*(.*?)$', nl)
                if opt_match:
                    opt_text = clean_option_text(opt_match.group(2))
                    # Ignore option lines that carry no text.
                    if opt_text:
                        options[opt_match.group(1).upper()] = opt_text
                j += 1

            answer = answer_map.get(q_num, "")
            if answer:
                questions.append({
                    "id": q_num,
                    "type": "single",
                    "stem": stem_text,
                    "options": dict(sorted(options.items())),
                    "answer": [answer],
                })
            else:
                # Report a 1-based line number; 0 means "no line" to the
                # logger, which would hide the location of a first-line hit.
                stats.log_warning(filename, f"第{q_num}题未在底部找到对应答案", i + 1)

            i = j
            continue

        i += 1

    return questions


def parse_inline_num_format(lines: list, filename: str) -> list:
    """
    Parse the layout where the stem ends with "(number, letter)".

    Expected shape:
        N、stem（number、letter）
        A：option
        B：option
        C：option
        D：option

    Args:
        lines: The file content split into lines.
        filename: Name of the source file (not read inside this function).

    Returns:
        A list of question dicts (``id``, ``type``, ``stem``, ``options``,
        ``answer``). A stem line with no option lines before the next stem
        line produces no question.
    """
    joined = "\n".join(lines)

    # Numbered line containing "(digits, letter)" in either parenthesis width.
    header_re = re.compile(
        r'^[\s]*(\d+)[、.．:：\s]\s*(.*?)[（(]\s*[\d\s]+[、,，]\s*([A-Da-d])\s*[）)](.*?)$',
        re.MULTILINE
    )
    # Option line: a letter A-D followed by an explicit separator.
    option_re = re.compile(
        r'^[ \t]*([A-Da-d])\s*[、.．:：]\s*(.*?)$',
        re.MULTILINE
    )

    headers = list(header_re.finditer(joined))
    # Each question's option area ends where the next header begins.
    stops = [h.start() for h in headers[1:]]
    stops.append(len(joined))

    collected = []
    for header, stop in zip(headers, stops):
        number, lead, letter, trailer = header.groups()
        answer = normalize_answer(letter)

        stem = lead.strip()
        trailer = trailer.strip()
        if trailer:
            stem += " " + trailer

        section = joined[header.end():stop]
        options = {}
        for found in option_re.finditer(section):
            options[found.group(1).upper()] = clean_option_text(found.group(2))

        if not (answer and options):
            continue

        collected.append({
            "id": int(number),
            "type": "single",
            "stem": stem,
            "options": dict(sorted(options.items())),
            "answer": [answer],
        })

    return collected


def parse_huaxiu_format(lines: list, filename: str) -> list:
    """
    Parse the Huaxiu layout, whose options may carry a numeric prefix.

    Expected shape:
        N. stem
        A：option
        2B：option    <- note the digit in front of the letter
        C：option
        4D：option
        正确答案：2B   (or: 正确答案：a)

    The text is cut into blocks at every line that starts with a number, a
    separator and whitespace; each block is then read line by line.

    Args:
        lines: The file content split into lines.
        filename: Name of the source file, used in warning messages.

    Returns:
        A list of question dicts (``id``, ``type``, ``stem``, ``options``,
        ``answer`` and, when text follows the answer, ``analysis``). Blocks
        lacking an answer or options are reported as warnings and left out.
    """
    answer_re = re.compile(r'(?:正确[答案]{0,2}|答|答案)[：:]?\s*(\d*[A-Da-d])\s*(.*?)$')
    prefixed_option_re = re.compile(r'^(\d*)([A-Da-d])\s*[：:]\s*(.*?)$')
    plain_option_re = re.compile(r'^([A-Da-d])\s*[、.．:：\s]\s*(.*?)$')
    header_re = re.compile(r'^[\s]*(\d+)[.．、]\s*(.*?)$')

    results = []

    for chunk in re.split(r'\n(?=\s*\d+[.．、]\s)', "\n".join(lines)):
        chunk = chunk.strip()
        if not chunk:
            continue

        rows = chunk.split("\n")

        # The first row of a block must be the numbered stem.
        header = header_re.match(rows[0].strip())
        if header is None:
            continue

        q_num = int(header.group(1))
        stem = header.group(2).strip()

        options = {}
        answer = ""
        analysis_text = ""

        for row in rows[1:]:
            row = row.strip()
            if not row:
                continue

            # Answer row (checked first; the letter may carry a digit prefix).
            found = answer_re.match(row)
            if found:
                answer = normalize_answer(found.group(1).strip())
                analysis_text = found.group(2).strip()
                continue

            # Option row with an optional digit prefix and a colon.
            found = prefixed_option_re.match(row)
            if found:
                options[found.group(2).upper()] = clean_option_text(found.group(3))
                continue

            # Fallback: ordinary option row with any of the usual separators.
            found = plain_option_re.match(row)
            if found:
                options[found.group(1).upper()] = clean_option_text(found.group(2))

        if not (answer and options):
            stats.log_warning(filename, f"第{q_num}题解析不完整（选项={list(options.keys())}, 答案={answer})")
            continue

        question = {
            "id": q_num,
            "type": "single",
            "stem": stem,
            "options": dict(sorted(options.items())),
            "answer": [answer],
        }
        if analysis_text:
            question["analysis"] = analysis_text
        results.append(question)

    return results


def parse_analysis_format(lines: list, filename: str) -> list:
    """
    Parse the layout that pairs each answer with an explanation.

    Expected shape:
        N.stem( )
        A.option1 B.option2 C.option3 D.option4     (or one option per line)
        答案：X explanation text ...
        further explanation lines ...

    A row holding several options is recognised before the per-line option
    rule is tried. The per-line rule accepts any line starting with A-D, so
    trying it first would store a whole "A.x B.y C.z D.w" row as option A.
    A row counts as an option row only when it yields at least three options
    whose letters are strictly ascending, which keeps prose containing stray
    Latin letters from being split up.

    Args:
        lines: The file content split into lines.
        filename: Name of the source file (not read inside this function).

    Returns:
        A list of question dicts (``id``, ``type``, ``stem``, ``options``,
        ``answer`` and, when present, ``analysis``). Questions lacking an
        answer line or options are left out.
    """
    inline_option_pattern = (
        r'([A-Da-d])[.．、]?\s*([^A-Da-d]+?)(?=[\t ]+[A-Da-d](?:\s*[.．、])?|$)'
    )
    questions = []
    i = 0

    while i < len(lines):
        stripped = lines[i].strip()
        if not stripped:
            i += 1
            continue

        # Question number line.
        q_match = re.match(r'^[\s]*(\d+)(?:[.．、]|(?=[^\d\s]))\s*(.*?)$', stripped)
        if not q_match:
            i += 1
            continue

        q_num = int(q_match.group(1))
        stem = q_match.group(2).strip().rstrip("（(）)").strip()

        j = i + 1
        options = {}
        answer = ""
        analysis = ""
        abcd_line_found = False  # only one inline option row per question

        while j < len(lines):
            nl = lines[j].strip()
            if not nl:
                j += 1
                continue

            # Answer line, optionally followed by explanation text.
            ans_match = re.match(r'^答案[：:]\s*([A-Da-d])\s*(.*?)$', nl)
            if ans_match:
                answer = normalize_answer(ans_match.group(1))
                analysis = ans_match.group(2).strip()
                break

            # Several options sharing one row: "A.xxx B.xxx C.xxx D.xxx".
            if not abcd_line_found:
                inline_opts = re.findall(inline_option_pattern, nl)
                keys = [k.upper() for k, _ in inline_opts]
                if len(inline_opts) >= 3 and keys == sorted(set(keys)):
                    for k, v in inline_opts:
                        options[k.upper()] = clean_option_text(v)
                    abcd_line_found = True
                    j += 1
                    continue

            # One option per line.
            opt_match = re.match(r'^([A-Da-d])[.．、]?\s*(.*?)$', nl)
            if opt_match:
                options[opt_match.group(1).upper()] = clean_option_text(opt_match.group(2))
                j += 1
                continue

            # Next question.
            if re.match(r'^\d+[.．、]', nl):
                break

            j += 1

        # Explanation continuing on the lines after the answer line. This is
        # only attempted when the answer line itself carried some text.
        if analysis:
            j2 = j + 1
            extra = []
            while j2 < len(lines):
                nl2 = lines[j2].strip()
                if re.match(r'^\d+[.．、]', nl2):
                    break
                if nl2 and not re.match(r'^[A-Da-d][.．、]', nl2):
                    extra.append(nl2)
                j2 += 1
            if extra:
                analysis += " " + " ".join(extra)

        if answer and options:
            q = {
                "id": q_num,
                "type": "single",
                "stem": stem,
                "options": dict(sorted(options.items())),
                "answer": [answer],
            }
            if analysis:
                q["analysis"] = analysis.strip()
            questions.append(q)

        # j starts at i + 1 and never decreases, so this always advances.
        i = j

    return questions


def parse_tab_inline_format(lines: list, filename: str) -> list:
    """
    Parse the layout where several options share one row (tab or space separated).

    Expected shape:
        N.stem（ ）
        A.option1<TAB>B.option2<TAB>C.option3<TAB>D.option4

    or:
        N、stem（ ）
        A、option1<TAB>B、option2<TAB>C、option3<TAB>D、option4

    The answer may sit in the stem's parentheses or on a following
    "正确答案：X" line. Also handled: a single-letter prefix before the
    question number (e.g. "R19"), an option letter alone on its own line
    with the text on the next line, the "不知道" / "无答案" answer markers,
    and multi-letter answers such as "CD".

    Args:
        lines: The file content split into lines.
        filename: Name of the source file (not read inside this function).

    Returns:
        A list of question dicts with ``id``, ``type``, ``stem``, ``options``
        and ``answer`` (a list of letters, empty when no answer was found),
        plus ``analysis`` when text follows the answer. A question is emitted
        whenever at least one option was collected.
    """
    questions = []
    i = 0
    consec_noanswer_count = 0  # NOTE(review): never read or updated below

    while i < len(lines):
        line = lines[i].rstrip("\n\r")
        stripped = line.strip()
        if not stripped:
            i += 1
            continue

        # Skip headings and other non-question lines.
        if re.match(r'^(题型|单项选择|一、|二、|模拟|盲人|医疗|感谢|盲人医师)', stripped):
            i += 1
            continue

        # Question number, optionally preceded by one ASCII letter (e.g. "R19").
        q_match = re.match(r'^[\s]*[A-Za-z]?\s*(\d+)\s*(?:[、.．:：\s]|(?=[^\d\s]))\s*(.*?)$', stripped)
        if not q_match:
            # A letter prefix without any number (e.g. "R、...") would need
            # its number inferred from context; that is not implemented.
            # NOTE(review): both paths below just skip the line.
            q_match = re.match(r'^[\s]*[A-Za-z]\s*[、.．:：]?\s*(.*?)$', stripped)
            if q_match and stripped[0].upper() == 'R':
                i += 1
                continue
            i += 1
            continue

        q_num = int(q_match.group(1))
        stem_text = q_match.group(2).strip().rstrip("（(）)").strip()
        if not stem_text and q_match.group(0).strip().upper().startswith('R'):
            stem_text = stripped.lstrip('Rr，,、 ').strip()

        # Options may be on the question line itself or on following lines.
        options = {}
        answer = ""
        analysis_text = ""

        # First look for options on the question line
        # ("A.opt B.opt C.opt D.opt").
        inline_opts = re.findall(
            r'([A-Da-d])\s*[.．、]?\s*([^A-Da-d\t\n]+?)(?=[\t ]+[A-Da-d](?:\s*[.．、])?|$)',
            stripped
        )
        if len(inline_opts) >= 2:
            for k, v in inline_opts:
                options[k.upper()] = clean_option_text(v)

        # Fewer than three options so far: scan the following lines.
        if len(options) < 3:
            j = i + 1
            pending_opt_key = None  # letter seen alone on a line, waiting for its text
            found_any_opt = False  # NOTE(review): assigned below but never read

            while j < len(lines):
                next_line = lines[j].strip()
                if not next_line:
                    if pending_opt_key:
                        # A blank line cancels the pending option letter.
                        pending_opt_key = None
                    j += 1
                    continue

                # Check for the next question first so its options are not
                # attributed to the current one.
                if re.match(r'^[\s]*[A-Za-z]?\s*\d+\s*[、.．:：]', next_line):
                    break

                # Answer line.
                ans_match = re.match(
                    r'(?:正确[答案]{0,2}|答|答案)\s*[：:]?\s*([A-Da-d]+|无答案|不知道)\s*(.*?)$',
                    next_line
                )
                if ans_match:
                    raw_ans = ans_match.group(1).strip().upper()
                    ans_rest = ans_match.group(2).strip()
                    # "无答案" / "不知道": add a synthetic "无答案" option
                    # under the next free letter and make it the answer.
                    if raw_ans in ['不知道', '无答案']:
                        next_key = chr(ord('A') + len(options))
                        options[next_key] = '无答案'
                        answer = next_key
                    else:
                        # Multi-letter answers such as "CD" are kept whole here
                        # and split into a list further down.
                        if len(raw_ans) > 1 and all(c in 'ABCD' for c in raw_ans):
                            answer = raw_ans
                        else:
                            answer = raw_ans[0] if raw_ans else ""
                    # Trailing text counts as analysis unless it starts with punctuation.
                    if ans_rest and not ans_rest.startswith('。') and not re.match(r'^[。，,、]', ans_rest):
                        analysis_text = ans_rest
                    break

                # A letter was seen alone on the previous line: this line
                # supplies its text.
                if pending_opt_key:
                    content_match = re.match(
                        r'[、.．,，]?\s*(.*?)(?=\s*[A-Da-d]\s*[、.．]|$)',
                        next_line
                    )
                    if content_match:
                        opt_text = clean_option_text(content_match.group(1))
                        if opt_text:
                            options[pending_opt_key] = opt_text
                    pending_opt_key = None
                    j += 1
                    continue

                # Several options on one row, separated by whitespace.
                inline_opts2 = re.findall(
                    r'([A-Da-d])\s*[、.．]?\s*([^A-Da-d\t\n]+?)(?=[\t \n]+[A-Da-d](?:\s*[、.．])?|$)',
                    next_line
                )
                if len(inline_opts2) >= 2:
                    found_any_opt = True
                    for k, v in inline_opts2:
                        options[k.upper()] = clean_option_text(v)
                    # Look for text after the last option on the row, such as
                    # an answer glued on without a separating space.
                    last_opt_end = 0
                    for m in re.finditer(
                        r'([A-Da-d])\s*[、.．]?\s*[^A-Da-d\t\n]+',
                        next_line
                    ):
                        last_opt_end = m.end()
                    rest_after_opts = next_line[last_opt_end:].strip()
                    if rest_after_opts:
                        # Does the remainder hold "正确答案：X"?
                        merged_ans = re.match(
                            r'(?:正确[答案]{0,2}|答|答案)\s*[：:]?\s*([A-Da-d]+)\s*(.*?)$',
                            rest_after_opts
                        )
                        if merged_ans:
                            raw_ans = merged_ans.group(1).strip().upper()
                            if all(c in 'ABCD' for c in raw_ans):
                                # NOTE(review): both arms yield raw_ans.
                                answer = raw_ans if len(raw_ans) == 1 else raw_ans
                            analysis_text = merged_ans.group(2).strip()
                    i = j  # move the outer cursor onto this option row so the outer scan resumes after it
                    j += 1
                    continue

                # Option letter alone on its line; the text follows on a later line.
                single_opt = re.match(r'^[\s]*([A-Da-d])\s*[、.．]?\s*$', next_line)
                if single_opt:
                    pending_opt_key = single_opt.group(1).upper()
                    j += 1
                    continue

                # Ordinary one-option-per-line row.
                opt_single = re.match(r'^[\s]*([A-Da-d])\s*[、.．]?\s*(.+)$', next_line)
                if opt_single:
                    key = opt_single.group(1).upper()
                    opt_text = clean_option_text(opt_single.group(2))
                    if opt_text and len(opt_text) >= 1:
                        # The row may still hold several options ("Atext Btext").
                        more_opts = re.findall(
                            r'([A-Da-d])\s*[、.．]?\s*([^A-Da-d]+?)(?=[\t ]+[A-Da-d](?:\s*[、.．])?|$)',
                            next_line
                        )
                        if len(more_opts) >= 2:
                            for k, v in more_opts:
                                options[k.upper()] = clean_option_text(v)
                            found_any_opt = True
                        else:
                            options[key] = opt_text
                            found_any_opt = True
                        i = j  # same cursor move as for an inline option row
                        j += 1
                        continue

                # Next question.
                # NOTE(review): same test as the one at the top of this loop,
                # so this branch cannot be reached.
                if re.match(r'^[\s]*[A-Za-z]?\s*\d+\s*[、.．:：]', next_line):
                    break

                j += 1

        # No answer line found: fall back to a letter inside the stem's parentheses.
        if not answer:
            ans_in_stem = re.search(r'[（(]\s*([A-Da-d])\s*[）)]', q_match.group(0))
            if ans_in_stem:
                answer = normalize_answer(ans_in_stem.group(1))

        # Split a multi-letter answer into a list of letters.
        answer_list = []
        if answer:
            if len(answer) > 1 and all(c in 'ABCD' for c in answer):
                answer_list = list(answer)
            elif len(answer) == 1 and answer.isalpha():
                answer_list = [answer]

        # Emit the question whenever options exist, even without an answer,
        # so the answer can be filled in by hand later.
        if options:
            q = {
                "id": q_num,
                "type": "single",
                "stem": stem_text,
                "options": dict(sorted(options.items())),
            }
            if answer_list:
                q["answer"] = answer_list
            else:
                q["answer"] = []
            if analysis_text:
                q["analysis"] = analysis_text
            questions.append(q)

        i += 1

    return questions


# ============================================================
# 主处理逻辑
# ============================================================

def process_file(filepath: Path) -> Optional[list]:
    """Parse one question-bank text file into a list of question dicts.

    The file is decoded by trying a fixed sequence of encodings, leading
    bullet prefixes are stripped, and the text is handed to the parser that
    matches the detected format. When the detected format is not one of the
    known ones, every parser is tried in turn and the first one that yields at
    least three questions is used.

    Args:
        filepath: Path of the text file; it must be located under
            ``SOURCE_DIR`` (``relative_to`` raises ``ValueError`` otherwise).

    Returns:
        The parsed questions, or ``None`` when the file cannot be decoded, is
        not recognised as a question bank (``stats.skipped_files`` is
        incremented in that case), yields no questions, or an exception is
        raised while processing (the error is recorded via ``stats.log_error``).
    """
    fname = filepath.name
    relpath = str(filepath.relative_to(SOURCE_DIR))

    logger.info(f"\n{'─' * 50}")
    logger.info(f"处理文件: {relpath}")

    try:
        # Try several encodings in order. "utf-8-sig" comes first because it
        # decodes plain UTF-8 as well and strips a leading BOM; decoding a BOM
        # file as plain "utf-8" succeeds but leaves U+FEFF in front of the
        # first line, where it is neither whitespace nor a digit for the
        # line-start patterns used on the text.
        raw_text = None
        for enc in ("utf-8-sig", "gbk", "gb2312", "gb18030", "big5"):
            try:
                with open(filepath, "r", encoding=enc) as f:
                    raw_text = f.read()
            except UnicodeError:
                continue
            logger.debug(f"  编码: {enc}")
            break

        if raw_text is None:
            stats.log_error(fname, "无法解码文件（尝试了utf-8/gbk/big5）")
            return None

        # Strip a leading bullet ("•") from every line so the parsers can see
        # question numbers and option letters at the start of the line.
        raw_text = re.sub(r'^•\s*', '', raw_text, flags=re.MULTILINE)

        # Bail out early for files that do not look like question banks.
        if not is_likely_question_bank(raw_text):
            stats.skipped_files += 1
            logger.info(f"  [跳过] 非题库内容")
            return None

        fmt = detect_format(raw_text)
        logger.info(f"  格式: {fmt}")

        lines = raw_text.split("\n")
        questions = []

        # Ordered (format, parser) table; the first entry equal to the
        # detected format is used.
        known_parsers = (
            (QuestionFormat.STANDARD, parse_standard_format),
            (QuestionFormat.INLINE_ANSWER, parse_inline_answer_format),
            (QuestionFormat.BOTTOM_KEY, parse_bottom_key_format),
            (QuestionFormat.INLINE_NUM, parse_inline_num_format),
            (QuestionFormat.HUAXIU, parse_huaxiu_format),
            (QuestionFormat.ANALYSIS, parse_analysis_format),
            (QuestionFormat.TAB_INLINE, parse_tab_inline_format),
        )
        parser = next(
            (func for known_fmt, func in known_parsers if fmt == known_fmt),
            None,
        )

        if parser is not None:
            questions = parser(lines, fname)
        else:
            # Unknown format: try every parser and keep the first result that
            # contains at least three questions.
            fallback_parsers = (
                ("standard", parse_standard_format),
                ("inline_answer", parse_inline_answer_format),
                ("inline_num", parse_inline_num_format),
                ("huaxiu", parse_huaxiu_format),
                ("tab_inline", parse_tab_inline_format),
                ("analysis", parse_analysis_format),
                ("bottom_key", parse_bottom_key_format),
            )
            for parser_name, parser_func in fallback_parsers:
                try:
                    result = parser_func(lines, fname)
                    if len(result) >= 3:
                        questions = result
                        logger.info(f"  回退格式匹配: {parser_name}（{len(result)}题）")
                        break
                except Exception:
                    # A crashing parser must not stop the other candidates,
                    # but keep the traceback in the log file for diagnosis.
                    logger.debug(f"  回退解析器 {parser_name} 失败", exc_info=True)
                    continue

        if not questions:
            stats.log_error(fname, "未能解析出任何题目")
            return None

        # Question ids are returned exactly as the parser produced them; no
        # renumbering is performed here.
        return questions

    except Exception as e:
        stats.log_error(fname, f"处理异常: {str(e)}")
        logger.exception("  异常详情:")
        return None


def build_meta(filename: str, questions: list, source_path: str) -> dict:
    """Build the ``meta`` section of an output JSON document.

    Args:
        filename: Name of the source text file; a trailing ``.txt`` extension
            (in any letter case) is dropped to form the title.
        questions: Parsed questions; only their count is used.
        source_path: Path of the source file, stored verbatim under ``source``
            and forwarded to ``extract_subject_from_filename``.

    Returns:
        A dict with ``title``, ``source``, ``totalQuestions``, ``version`` and
        ``createdAt``, plus ``subject`` when a non-empty subject is extracted.
    """
    subject = extract_subject_from_filename(filename, source_path)

    # Only strip the extension at the end of the name and accept ".TXT" too;
    # a plain str.replace would also remove ".txt" from the middle of a name
    # and leave an upper-case extension in the title.
    if filename[-4:].lower() == ".txt":
        title = filename[:-4]
    else:
        title = filename.replace(".txt", "")

    meta = {
        "title": title,
        "source": source_path,
        "totalQuestions": len(questions),
        "version": "1.0",
        "createdAt": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }
    if subject:
        meta["subject"] = subject
    return meta


def output_json(filename: str, questions: list, source_path: str):
    """Write the questions of one source file as a JSON document.

    The document has a ``meta`` section (from ``build_meta``) and a
    ``questions`` list, and is written to ``OUTPUT_DIR`` under the source file
    name with its ``.txt`` extension replaced by ``.json``. The number of
    written questions is added to ``stats.total_questions``.

    Args:
        filename: Name of the source text file.
        questions: Parsed questions to serialise.
        source_path: Path of the source file, recorded in the metadata.

    NOTE: the output directory is flat, so two source files that share a name
    are written to the same output path and the later one overwrites the
    earlier one.
    """
    meta = build_meta(filename, questions, source_path)

    output = {
        "meta": meta,
        "questions": questions,
    }

    # Swap only the trailing extension and accept ".TXT" as well. With a plain
    # str.replace an upper-case extension is left unchanged, so the JSON would
    # be written to a file that still carries a .txt-like name.
    if filename[-4:].lower() == ".txt":
        json_filename = filename[:-4] + ".json"
    else:
        json_filename = filename.replace(".txt", ".json")
    output_path = OUTPUT_DIR / json_filename

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    logger.info(f"  + 输出: {output_path.name} ({len(questions)}题)")
    stats.total_questions += len(questions)


def main():
    """Run the whole conversion: discover inputs, convert them, write a summary.

    Collects the ``.txt`` files in ``SOURCE_DIR`` and in its immediate
    subdirectories, skips names listed in ``NON_QBANK_FILES``, converts every
    other file through ``process_file`` and ``output_json``, logs the
    statistics summary and writes ``summary.json`` into ``OUTPUT_DIR``.
    """
    logger.info("=" * 60)
    logger.info("盲人医疗按摩题库 JSON 转换工具")
    logger.info(f"源目录: {SOURCE_DIR}")
    logger.info(f"输出目录: {OUTPUT_DIR}")
    logger.info(f"开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    logger.info("=" * 60)

    # Make sure the output directory exists before anything is written.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    def is_input_file(path):
        # The script's own log file lives in SOURCE_DIR and ends in ".txt";
        # it is not an input and must not be scanned.
        return (
            path.is_file()
            and path.suffix.lower() == ".txt"
            and path != LOG_FILE
        )

    # Collect .txt files from the source directory and one level below it.
    # OUTPUT_DIR is itself a subdirectory of SOURCE_DIR and holds generated
    # files only, so it is left out of the scan.
    top_level = list(SOURCE_DIR.iterdir())
    txt_files = [entry for entry in top_level if is_input_file(entry)]
    for subdir in top_level:
        if subdir.is_dir() and subdir != OUTPUT_DIR:
            txt_files.extend(
                entry for entry in subdir.iterdir() if is_input_file(entry)
            )

    stats.total_files = len(txt_files)
    logger.info(f"发现 {len(txt_files)} 个 .txt 文件\n")

    for filepath in sorted(txt_files):
        fname = filepath.name

        # Skip files known not to be question banks.
        if fname in NON_QBANK_FILES:
            stats.skipped_files += 1
            logger.info(f"[跳过] {filepath.relative_to(SOURCE_DIR)} (知识汇总)")
            continue

        skipped_before = stats.skipped_files
        questions = process_file(filepath)
        if questions is None:
            # process_file also returns None for files it skipped as "not a
            # question bank" (after incrementing skipped_files); counting
            # those as failures too would register the same file twice.
            if stats.skipped_files == skipped_before:
                stats.failed_files += 1
        else:
            relpath = str(filepath.relative_to(SOURCE_DIR))
            output_json(fname, questions, relpath)
            stats.processed_files += 1

    # Log the overall summary.
    logger.info(f"\n{stats.summary()}")

    # Persist the run statistics and recorded errors as summary.json.
    summary_data = {
        "conversionTime": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "sourceDir": str(SOURCE_DIR),
        "totalFiles": stats.total_files,
        "processedFiles": stats.processed_files,
        "skippedFiles": stats.skipped_files,
        "failedFiles": stats.failed_files,
        "totalQuestions": stats.total_questions,
        "errors": [
            {"file": e[0], "line": e[1], "detail": e[2]}
            for e in stats.errors
        ],
    }
    with open(OUTPUT_DIR / "summary.json", "w", encoding="utf-8") as f:
        json.dump(summary_data, f, ensure_ascii=False, indent=2)


# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
