diff --git a/backend/src/common/document/ExtractionClient.ts b/backend/src/common/document/ExtractionClient.ts index ab2bfd07..67302dae 100644 --- a/backend/src/common/document/ExtractionClient.ts +++ b/backend/src/common/document/ExtractionClient.ts @@ -54,6 +54,7 @@ export interface ForensicsTable { headers?: string[]; rowCount: number; colCount: number; + issues?: ForensicsIssue[]; // 每个表格的问题列表 } export interface ForensicsIssue { @@ -354,7 +355,7 @@ class ExtractionClient implements IExtractionClient { formData.append('tolerance_percent', config.tolerancePercent.toString()); const response = await axios.post( - `${this.baseUrl}/api/v1/forensics/analyze`, + `${this.baseUrl}/api/v1/forensics/analyze_docx`, formData, { headers: { diff --git a/backend/src/modules/rvw/controllers/reviewController.ts b/backend/src/modules/rvw/controllers/reviewController.ts index c6258959..be2d47b8 100644 --- a/backend/src/modules/rvw/controllers/reviewController.ts +++ b/backend/src/modules/rvw/controllers/reviewController.ts @@ -27,6 +27,17 @@ function getUserId(request: FastifyRequest): string { return userId; } +/** + * 获取租户ID(从JWT Token中获取) + */ +function getTenantId(request: FastifyRequest): string { + const tenantId = (request as any).user?.tenantId; + if (!tenantId) { + throw new Error('Tenant not found'); + } + return tenantId; +} + // ==================== 任务创建 ==================== /** @@ -43,7 +54,8 @@ export async function createTask( ) { try { const userId = getUserId(request); - logger.info('[RVW:Controller] 上传稿件', { userId }); + const tenantId = getTenantId(request); + logger.info('[RVW:Controller] 上传稿件', { userId, tenantId }); // 获取上传的文件 const data = await request.file(); @@ -105,7 +117,7 @@ export async function createTask( } // 创建任务 - const task = await reviewService.createTask(file, filename, userId, modelType); + const task = await reviewService.createTask(file, filename, userId, tenantId, modelType); logger.info('[RVW:Controller] 任务已创建', { taskId: task.id }); diff 
--git a/backend/src/modules/rvw/services/reviewService.ts b/backend/src/modules/rvw/services/reviewService.ts index 5f04f5d1..da4f97b2 100644 --- a/backend/src/modules/rvw/services/reviewService.ts +++ b/backend/src/modules/rvw/services/reviewService.ts @@ -17,6 +17,24 @@ import { ModelType } from '../../../common/llm/adapters/types.js'; import { logger } from '../../../common/logging/index.js'; import { jobQueue } from '../../../common/jobs/index.js'; import { Prisma } from '@prisma/client'; +import { storage } from '../../../common/storage/index.js'; +import { randomUUID } from 'crypto'; +import path from 'path'; + +/** + * 生成 RVW 模块的 OSS 存储 Key + * 格式: tenants/{tenantId}/users/{userId}/rvw/{taskId}/{filename} + */ +function generateRvwStorageKey( + tenantId: string, + userId: string, + taskId: string, + filename: string +): string { + const uuid = randomUUID().replace(/-/g, '').substring(0, 16); + const ext = path.extname(filename).toLowerCase(); + return `tenants/${tenantId}/users/${userId}/rvw/${taskId}/${uuid}${ext}`; +} import { AgentType, TaskStatus, @@ -44,6 +62,7 @@ import { * @param file 文件Buffer * @param filename 文件名 * @param userId 用户ID + * @param tenantId 租户ID * @param modelType 模型类型 * @returns 创建的任务 */ @@ -51,11 +70,12 @@ export async function createTask( file: Buffer, filename: string, userId: string, + tenantId: string, modelType: ModelType = 'deepseek-v3' ) { - logger.info('[RVW] 创建审查任务', { filename, userId, modelType }); + logger.info('[RVW] 创建审查任务', { filename, userId, tenantId, modelType }); - // 创建任务记录(状态为pending,等待用户选择智能体后运行) + // 1. 先创建任务记录获取 taskId const task = await prisma.reviewTask.create({ data: { userId, @@ -70,12 +90,37 @@ export async function createTask( logger.info('[RVW] 任务已创建', { taskId: task.id, status: task.status }); - // 异步提取文档文本(预处理,不运行评估) + // 2. 
生成 OSS 存储 Key 并上传文件 + const storageKey = generateRvwStorageKey(tenantId, userId, task.id, filename); + let updatedTask = task; + + try { + logger.info('[RVW] 开始上传文件到存储', { taskId: task.id, storageKey }); + await storage.upload(storageKey, file); + logger.info('[RVW] 文件已上传到存储', { taskId: task.id, storageKey }); + + // 3. 更新任务的 filePath 字段 + updatedTask = await prisma.reviewTask.update({ + where: { id: task.id }, + data: { filePath: storageKey }, + }); + logger.info('[RVW] 任务 filePath 已更新', { taskId: task.id, filePath: storageKey }); + } catch (uploadError) { + logger.error('[RVW] 文件上传失败', { + taskId: task.id, + storageKey, + error: uploadError instanceof Error ? uploadError.message : 'Unknown error', + stack: uploadError instanceof Error ? uploadError.stack : undefined, + }); + // 上传失败不阻塞任务创建,DataForensicsSkill 会优雅降级 + } + + // 4. 异步提取文档文本(预处理,不运行评估) extractDocumentAsync(task.id, file, filename).catch(error => { logger.error('[RVW] 文档提取失败', { taskId: task.id, error: error.message }); }); - return task; + return updatedTask; } /** @@ -191,6 +236,7 @@ export async function runReview(params: RunReviewParams): Promise<{ jobId: strin agents, extractedText: task.extractedText, modelType: (task.modelUsed || 'deepseek-v3') as ModelType, + __expireInSeconds: 10 * 60, // 10分钟超时(审稿任务通常2-3分钟完成) }); logger.info('[RVW] 审查任务已推送到队列', { @@ -364,6 +410,10 @@ export async function getTaskReport(userId: string, taskId: string): Promise { ...initialContext, profile, previousResults: [], - } as TContext; + } as unknown as TContext; - logger.info({ + logger.info('[SkillExecutor] Starting pipeline execution', { taskId: context.taskId, profileId: profile.id, pipelineLength: profile.pipeline.length, - }, '[SkillExecutor] Starting pipeline execution'); + }); // 遍历 Pipeline for (const item of profile.pipeline) { // 跳过禁用的 Skill if (!item.enabled) { - logger.debug({ skillId: item.skillId }, '[SkillExecutor] Skill disabled, skipping'); + logger.debug('[SkillExecutor] Skill disabled, skipping', { 
skillId: item.skillId }); results.push(this.createSkippedResult(item.skillId, 'Skill disabled in profile')); continue; } @@ -78,20 +78,20 @@ export class SkillExecutor { // 获取 Skill const skill = SkillRegistry.get(item.skillId); if (!skill) { - logger.warn({ skillId: item.skillId }, '[SkillExecutor] Skill not found in registry'); + logger.warn('[SkillExecutor] Skill not found in registry', { skillId: item.skillId }); results.push(this.createSkippedResult(item.skillId, 'Skill not found')); continue; } // 前置检查 - if (skill.canRun && !skill.canRun(context as SkillContext)) { - logger.info({ skillId: item.skillId }, '[SkillExecutor] Skill pre-check failed, skipping'); + if (skill.canRun && !skill.canRun(context as unknown as SkillContext)) { + logger.info('[SkillExecutor] Skill pre-check failed, skipping', { skillId: item.skillId }); results.push(this.createSkippedResult(item.skillId, 'Pre-check failed')); continue; } // 执行 Skill - const result = await this.executeSkill(skill, context as SkillContext, item, profile); + const result = await this.executeSkill(skill, context as unknown as SkillContext, item, profile); results.push(result); // 调用完成回调(V2.1 扩展点) @@ -100,7 +100,7 @@ export class SkillExecutor { await this.config.onSkillComplete(item.skillId, result, context); } catch (callbackError: unknown) { const errorMessage = callbackError instanceof Error ? 
callbackError.message : String(callbackError); - logger.error({ skillId: item.skillId, error: errorMessage }, '[SkillExecutor] onSkillComplete callback failed'); + logger.error('[SkillExecutor] onSkillComplete callback failed', { skillId: item.skillId, error: errorMessage }); } } @@ -112,7 +112,7 @@ export class SkillExecutor { // 检查是否需要中断 if (result.status === 'error' && !this.shouldContinue(item, profile)) { - logger.warn({ skillId: item.skillId }, '[SkillExecutor] Skill failed and continueOnError=false, stopping'); + logger.warn('[SkillExecutor] Skill failed and continueOnError=false, stopping', { skillId: item.skillId }); break; } } @@ -120,13 +120,13 @@ export class SkillExecutor { // 生成汇总 const summary = this.buildSummary(context.taskId, profile.id, results, startTime); - logger.info({ + logger.info('[SkillExecutor] Pipeline execution completed', { taskId: context.taskId, overallStatus: summary.overallStatus, totalTime: summary.totalExecutionTime, successCount: summary.successCount, errorCount: summary.errorCount, - }, '[SkillExecutor] Pipeline execution completed'); + }); return summary; } @@ -144,23 +144,23 @@ export class SkillExecutor { const timeoutMultiplier = profile.globalConfig?.timeoutMultiplier ?? 1; const timeout = Math.round((item.timeout ?? skill.metadata.defaultTimeout ?? 
this.config.defaultTimeout) * timeoutMultiplier); - logger.info({ + logger.info('[SkillExecutor] Executing skill', { skillId: skill.metadata.id, taskId: context.taskId, timeout, - }, '[SkillExecutor] Executing skill'); + }); try { // 带超时执行 const result = await this.executeWithTimeout(skill, context, item.config, timeout); - logger.info({ + logger.info('[SkillExecutor] Skill execution completed', { skillId: skill.metadata.id, taskId: context.taskId, status: result.status, executionTime: result.executionTime, issueCount: result.issues.length, - }, '[SkillExecutor] Skill execution completed'); + }); return result; } catch (error: unknown) { @@ -169,11 +169,11 @@ export class SkillExecutor { // 判断是否超时 if (errorMessage === 'SKILL_TIMEOUT') { - logger.warn({ + logger.warn('[SkillExecutor] Skill execution timed out', { skillId: skill.metadata.id, taskId: context.taskId, timeout, - }, '[SkillExecutor] Skill execution timed out'); + }); return { skillId: skill.metadata.id, @@ -192,11 +192,11 @@ export class SkillExecutor { } // 其他错误 - logger.error({ + logger.error('[SkillExecutor] Skill execution failed', { skillId: skill.metadata.id, taskId: context.taskId, error: errorMessage, - }, '[SkillExecutor] Skill execution failed'); + }); return { skillId: skill.metadata.id, diff --git a/backend/src/modules/rvw/skills/core/profile.ts b/backend/src/modules/rvw/skills/core/profile.ts index 61aedd80..31546a46 100644 --- a/backend/src/modules/rvw/skills/core/profile.ts +++ b/backend/src/modules/rvw/skills/core/profile.ts @@ -7,7 +7,7 @@ * @since 2026-02-18 */ -import { JournalProfile, PipelineItem } from './types.js'; +import { JournalProfile } from './types.js'; import { logger } from '../../../../common/logging/index.js'; /** @@ -34,13 +34,13 @@ export const DEFAULT_PROFILE: JournalProfile = { skillId: 'EditorialSkill', enabled: true, optional: false, - timeout: 45000, + timeout: 180000, // 180 秒 }, { skillId: 'MethodologySkill', enabled: true, optional: false, - timeout: 45000, + 
timeout: 180000, // 180 秒 }, ], @@ -78,13 +78,13 @@ export const CHINESE_CORE_PROFILE: JournalProfile = { config: { standard: 'chinese-core', }, - timeout: 45000, + timeout: 180000, // 180 秒 }, { skillId: 'MethodologySkill', enabled: true, optional: false, - timeout: 45000, + timeout: 180000, // 180 秒 }, ], @@ -154,11 +154,11 @@ export class ProfileResolver { const profile = PROFILES.get(id); if (!profile) { - logger.warn({ profileId: id }, '[ProfileResolver] Profile not found, using default'); + logger.warn('[ProfileResolver] Profile not found, using default', { profileId: id }); return DEFAULT_PROFILE; } - logger.debug({ profileId: id }, '[ProfileResolver] Profile resolved'); + logger.debug('[ProfileResolver] Profile resolved', { profileId: id }); return profile; } @@ -196,10 +196,10 @@ export class ProfileResolver { enabled: enabledSkills.has(item.skillId), })); - logger.debug({ + logger.debug('[ProfileResolver] Profile built from agents', { selectedAgents, enabledSkills: Array.from(enabledSkills), - }, '[ProfileResolver] Profile built from agents'); + }); return baseProfile; } @@ -223,7 +223,7 @@ export class ProfileResolver { */ static register(profile: JournalProfile): void { PROFILES.set(profile.id, profile); - logger.info({ profileId: profile.id }, '[ProfileResolver] Profile registered'); + logger.info('[ProfileResolver] Profile registered', { profileId: profile.id }); } /** diff --git a/backend/src/modules/rvw/skills/core/registry.ts b/backend/src/modules/rvw/skills/core/registry.ts index f0796166..b642ea12 100644 --- a/backend/src/modules/rvw/skills/core/registry.ts +++ b/backend/src/modules/rvw/skills/core/registry.ts @@ -24,11 +24,11 @@ class SkillRegistryClass { const { id, version } = skill.metadata; if (this.skills.has(id)) { - logger.warn({ skillId: id }, '[SkillRegistry] Skill already registered, overwriting'); + logger.warn('[SkillRegistry] Skill already registered, overwriting', { skillId: id }); } this.skills.set(id, skill); - logger.info({ 
skillId: id, version }, '[SkillRegistry] Skill registered'); + logger.info('[SkillRegistry] Skill registered', { skillId: id, version }); } /** @@ -92,7 +92,7 @@ class SkillRegistryClass { unregister(id: string): boolean { const result = this.skills.delete(id); if (result) { - logger.info({ skillId: id }, '[SkillRegistry] Skill unregistered'); + logger.info('[SkillRegistry] Skill unregistered', { skillId: id }); } return result; } @@ -118,7 +118,7 @@ class SkillRegistryClass { */ markInitialized(): void { this.initialized = true; - logger.info({ skillCount: this.size }, '[SkillRegistry] Registry initialized'); + logger.info('[SkillRegistry] Registry initialized', { skillCount: this.size }); } /** diff --git a/backend/src/modules/rvw/skills/core/types.ts b/backend/src/modules/rvw/skills/core/types.ts index ff27ccab..67dff451 100644 --- a/backend/src/modules/rvw/skills/core/types.ts +++ b/backend/src/modules/rvw/skills/core/types.ts @@ -111,7 +111,7 @@ export interface ForensicsResult { * RVW 模块扩展字段 */ export interface RvwContextExtras { - documentPath: string; + documentPath?: string; // 可选:DataForensicsSkill 需要,Editorial/Methodology 不需要 documentContent: string; documentMeta?: DocumentMeta; tables?: TableData[]; diff --git a/backend/src/modules/rvw/skills/library/BaseSkill.ts b/backend/src/modules/rvw/skills/library/BaseSkill.ts index 8da0cbaa..980e4db0 100644 --- a/backend/src/modules/rvw/skills/library/BaseSkill.ts +++ b/backend/src/modules/rvw/skills/library/BaseSkill.ts @@ -20,6 +20,20 @@ import { } from '../core/types.js'; import { logger } from '../../../../common/logging/index.js'; +/** + * execute 方法的返回类型 + * 不需要包含 skillId, skillName, startedAt, completedAt, executionTime + * 这些字段由 BaseSkill.run() 自动填充 + */ +export type ExecuteResult = { + status: 'success' | 'warning' | 'error'; + score?: number; + scoreLabel?: string; + issues: SkillResult['issues']; + data?: unknown; + error?: string; +}; + /** * Skill 基类 * 使用泛型支持不同上下文和配置类型 @@ -39,11 +53,12 @@ export 
abstract class BaseSkill< /** * 子类实现具体逻辑 + * 返回值不需要包含 skillId, skillName, startedAt, completedAt, executionTime */ abstract execute( context: TContext, config?: TConfig - ): Promise>; + ): Promise; /** * 执行入口(统一处理日志、计时、配置验证等) @@ -52,10 +67,10 @@ export abstract class BaseSkill< const startedAt = new Date(); const startTime = Date.now(); - logger.info({ + logger.info(`[${this.metadata.id}] Starting execution`, { skillId: this.metadata.id, taskId: context.taskId, - }, `[${this.metadata.id}] Starting execution`); + }); try { // 配置验证(使用 Zod) @@ -64,13 +79,13 @@ export abstract class BaseSkill< const result = await this.execute(context, validatedConfig); const executionTime = Date.now() - startTime; - logger.info({ + logger.info(`[${this.metadata.id}] Execution completed`, { skillId: this.metadata.id, taskId: context.taskId, status: result.status, executionTime, issueCount: result.issues.length, - }, `[${this.metadata.id}] Execution completed`); + }); return { ...result, @@ -90,15 +105,15 @@ export abstract class BaseSkill< : SkillErrorCodes.SKILL_EXECUTION_ERROR; const errorMessage = isValidationError - ? `配置验证失败: ${(error as z.ZodError).errors.map(e => e.message).join(', ')}` + ? `配置验证失败: ${(error as z.ZodError).issues.map((e: z.ZodIssue) => e.message).join(', ')}` : `执行失败: ${error instanceof Error ? error.message : String(error)}`; - logger.error({ + logger.error(`[${this.metadata.id}] Execution failed`, { skillId: this.metadata.id, taskId: context.taskId, error: error instanceof Error ? 
error.message : String(error), errorType, - }, `[${this.metadata.id}] Execution failed`); + }); return { skillId: this.metadata.id, diff --git a/backend/src/modules/rvw/skills/library/DataForensicsSkill.ts b/backend/src/modules/rvw/skills/library/DataForensicsSkill.ts index b5f606fc..de282fcd 100644 --- a/backend/src/modules/rvw/skills/library/DataForensicsSkill.ts +++ b/backend/src/modules/rvw/skills/library/DataForensicsSkill.ts @@ -8,11 +8,10 @@ * @since 2026-02-18 */ -import { BaseSkill } from './BaseSkill.js'; +import { BaseSkill, ExecuteResult } from './BaseSkill.js'; import { SkillMetadata, SkillContext, - SkillResult, DataForensicsConfigSchema, DataForensicsConfig, ForensicsResult, @@ -23,19 +22,12 @@ import { IExtractionClient, ForensicsResult as ClientForensicsResult, } from '../../../../common/document/ExtractionClient.js'; +import { storage } from '../../../../common/storage/index.js'; import { logger } from '../../../../common/logging/index.js'; - -/** - * 安全:允许的文件存储路径前缀 - */ -const ALLOWED_PATH_PREFIXES = [ - '/app/uploads/', // Docker 容器内路径 - 'D:\\MyCursor\\', // 开发环境 Windows - 'D:/MyCursor/', // 开发环境 Windows (forward slash) - '/tmp/rvw-uploads/', // 临时目录 - 'C:\\Users\\', // Windows 用户目录 - '/home/', // Linux 用户目录 -]; +import * as fs from 'fs/promises'; +import * as path from 'path'; +import * as os from 'os'; +import { randomUUID } from 'crypto'; /** * 数据侦探 Skill @@ -76,39 +68,35 @@ export class DataForensicsSkill extends BaseSkill { - const normalizedPrefix = prefix.replace(/\\/g, '/'); - return normalizedPath.startsWith(normalizedPrefix); - }); - - if (!isPathAllowed) { - logger.error({ + // 安全检查:OSS key 格式验证(tenants/xxx/users/xxx/rvw/xxx/xxx.docx) + const isOssKey = context.documentPath.startsWith('tenants/') || + context.documentPath.startsWith('temp/'); + + if (!isOssKey) { + logger.warn('[DataForensicsSkill] Invalid storage key format', { taskId: context.taskId, - documentPath: '[REDACTED]', // 不记录完整路径 - }, '[DataForensicsSkill] Document path 
not in allowed prefixes (security check)'); + }); return false; } // 检查是否包含路径遍历 if (context.documentPath.includes('..')) { - logger.error({ + logger.error('[DataForensicsSkill] Path traversal detected (security check)', { taskId: context.taskId, - }, '[DataForensicsSkill] Path traversal detected (security check)'); + }); return false; } @@ -117,23 +105,46 @@ export class DataForensicsSkill extends BaseSkill> { + ): Promise { const checkLevel = config?.checkLevel || 'L1_L2_L25'; const tolerancePercent = config?.tolerancePercent || 0.1; + const storageKey = context.documentPath!; - logger.info({ + logger.info('[DataForensicsSkill] Starting analysis', { taskId: context.taskId, + storageKey, checkLevel, tolerancePercent, - }, '[DataForensicsSkill] Starting analysis'); + }); + + // 创建临时文件路径 + const tempDir = os.tmpdir(); + const tempFilename = `rvw-${randomUUID()}.docx`; + const tempFilePath = path.join(tempDir, tempFilename); try { - // 使用依赖注入的 client - const result = await this.extractionClient.analyzeDocx(context.documentPath, { + // 1. 从 OSS 下载文件到临时目录 + logger.info('[DataForensicsSkill] Downloading file from storage', { + taskId: context.taskId, + storageKey, + tempFilePath, + }); + + const fileBuffer = await storage.download(storageKey); + await fs.writeFile(tempFilePath, fileBuffer); + + logger.info('[DataForensicsSkill] File downloaded successfully', { + taskId: context.taskId, + fileSize: fileBuffer.length, + }); + + // 2. 
调用 Python 服务分析临时文件 + const result = await this.extractionClient.analyzeDocx(tempFilePath, { checkLevel, tolerancePercent, }); @@ -159,13 +170,13 @@ export class DataForensicsSkill extends BaseSkill ({ - severity: issue.severity, - type: issue.type, - message: issue.message, - location: issue.location, - evidence: issue.evidence, - })); + // 防御性检查 + const rawTables = result.tables || []; + + // Python 返回的是 methodsFound(驼峰),也可能是 methods + const rawMethods = (result as any).methodsFound || result.methods || []; + + // 从 tables[].issues 中收集所有 issues + const allIssues: Issue[] = []; + for (const table of rawTables) { + const tableIssues = (table as any).issues || []; + for (const issue of tableIssues) { + allIssues.push({ + severity: issue.severity, + type: issue.type, + message: issue.message, + location: issue.location, + evidence: issue.evidence, + }); + } + } + + // 也检查顶层的 issues(兼容旧格式) + const topLevelIssues = result.issues || []; + for (const issue of topLevelIssues) { + allIssues.push({ + severity: issue.severity, + type: issue.type, + message: issue.message, + location: issue.location, + evidence: issue.evidence, + }); + } + + // 构建 summary(从 Python 返回的顶层字段或 summary 对象) + const pyResult = result as any; + const summary = result.summary || { + totalTables: pyResult.totalTables ?? rawTables.length, + totalIssues: pyResult.totalIssues ?? allIssues.length, + errorCount: pyResult.errorCount ?? allIssues.filter(i => i.severity === 'ERROR').length, + warningCount: pyResult.warningCount ?? 
allIssues.filter(i => i.severity === 'WARNING').length, + }; return { - tables: result.tables.map(t => ({ - id: t.id, - caption: t.caption, - data: t.data, - html: t.html, - headers: t.headers, - rowCount: t.rowCount, - colCount: t.colCount, - })), - methods: result.methods, - issues, + tables: rawTables.map(t => { + const tableIssues = ((t as any).issues || []).map((issue: any) => ({ + severity: issue.severity, + type: issue.type, + message: issue.message, + location: issue.location, + evidence: issue.evidence, + })); + return { + id: t.id || '', + caption: t.caption || '', + data: t.data || [], + html: t.html || '', + headers: t.headers || [], + rowCount: t.rowCount || 0, + colCount: t.colCount || 0, + issues: tableIssues, // 保留每个表格的 issues + }; + }), + methods: rawMethods, + issues: allIssues, summary: { - totalTables: result.summary.totalTables, - totalIssues: result.summary.totalIssues, - errorCount: result.summary.errorCount, - warningCount: result.summary.warningCount, + totalTables: summary.totalTables ?? rawTables.length, + totalIssues: summary.totalIssues ?? allIssues.length, + errorCount: summary.errorCount ?? 0, + warningCount: summary.warningCount ?? 
0, }, }; } diff --git a/backend/src/modules/rvw/skills/library/EditorialSkill.ts b/backend/src/modules/rvw/skills/library/EditorialSkill.ts index 4085b785..66a07419 100644 --- a/backend/src/modules/rvw/skills/library/EditorialSkill.ts +++ b/backend/src/modules/rvw/skills/library/EditorialSkill.ts @@ -8,17 +8,16 @@ * @since 2026-02-18 */ -import { BaseSkill } from './BaseSkill.js'; +import { BaseSkill, ExecuteResult } from './BaseSkill.js'; import { SkillMetadata, SkillContext, - SkillResult, EditorialConfigSchema, EditorialConfig, Issue, } from '../core/types.js'; import { reviewEditorialStandards } from '../../services/editorialService.js'; -import { EditorialReview, EditorialItem } from '../../types/index.js'; +import { EditorialReview } from '../../types/index.js'; import { logger } from '../../../../common/logging/index.js'; /** @@ -45,7 +44,7 @@ export class EditorialSkill extends BaseSkill { inputs: ['documentContent'], outputs: ['editorialResult'], - defaultTimeout: 45000, // 45 秒 + defaultTimeout: 180000, // 180 秒(LLM 调用可能较慢) retryable: true, icon: '📋', @@ -57,18 +56,18 @@ export class EditorialSkill extends BaseSkill { */ canRun(context: SkillContext): boolean { if (!context.documentContent || context.documentContent.trim().length === 0) { - logger.warn({ taskId: context.taskId }, '[EditorialSkill] No document content'); + logger.warn('[EditorialSkill] No document content', { taskId: context.taskId }); return false; } // 资源限制检查 const maxLength = DEFAULT_MAX_CONTENT_LENGTH; if (context.documentContent.length > maxLength) { - logger.warn({ + logger.warn('[EditorialSkill] Content too long', { taskId: context.taskId, contentLength: context.documentContent.length, limit: maxLength, - }, '[EditorialSkill] Content too long'); + }); return false; } @@ -81,23 +80,23 @@ export class EditorialSkill extends BaseSkill { async execute( context: SkillContext, config?: EditorialConfig - ): Promise> { + ): Promise { const maxContentLength = config?.maxContentLength || 
DEFAULT_MAX_CONTENT_LENGTH; - logger.info({ + logger.info('[EditorialSkill] Starting evaluation', { taskId: context.taskId, contentLength: context.documentContent.length, - }, '[EditorialSkill] Starting evaluation'); + }); // 截断过长内容 let content = context.documentContent; if (content.length > maxContentLength) { content = content.substring(0, maxContentLength); - logger.warn({ + logger.warn('[EditorialSkill] Content truncated', { taskId: context.taskId, originalLength: context.documentContent.length, truncatedLength: maxContentLength, - }, '[EditorialSkill] Content truncated'); + }); } // 调用现有 editorialService @@ -119,13 +118,13 @@ export class EditorialSkill extends BaseSkill { status = 'success'; } - logger.info({ + logger.info('[EditorialSkill] Evaluation completed', { taskId: context.taskId, score: result.overall_score, itemCount: result.items.length, errorCount, warningCount, - }, '[EditorialSkill] Evaluation completed'); + }); return { status, diff --git a/backend/src/modules/rvw/skills/library/MethodologySkill.ts b/backend/src/modules/rvw/skills/library/MethodologySkill.ts index b05d97bb..75c9ac3a 100644 --- a/backend/src/modules/rvw/skills/library/MethodologySkill.ts +++ b/backend/src/modules/rvw/skills/library/MethodologySkill.ts @@ -8,17 +8,16 @@ * @since 2026-02-18 */ -import { BaseSkill } from './BaseSkill.js'; +import { BaseSkill, ExecuteResult } from './BaseSkill.js'; import { SkillMetadata, SkillContext, - SkillResult, MethodologyConfigSchema, MethodologyConfig, Issue, } from '../core/types.js'; import { reviewMethodology } from '../../services/methodologyService.js'; -import { MethodologyReview, MethodologyIssue } from '../../types/index.js'; +import { MethodologyReview } from '../../types/index.js'; import { logger } from '../../../../common/logging/index.js'; /** @@ -45,7 +44,7 @@ export class MethodologySkill extends BaseSkill inputs: ['documentContent', 'methods'], outputs: ['methodologyResult'], - defaultTimeout: 45000, // 45 秒 + defaultTimeout: 
180000, // 180 秒(方法学分析需要更长时间) retryable: true, icon: '🔬', @@ -57,18 +56,18 @@ export class MethodologySkill extends BaseSkill */ canRun(context: SkillContext): boolean { if (!context.documentContent || context.documentContent.trim().length === 0) { - logger.warn({ taskId: context.taskId }, '[MethodologySkill] No document content'); + logger.warn('[MethodologySkill] No document content', { taskId: context.taskId }); return false; } // 资源限制检查 const maxLength = DEFAULT_MAX_CONTENT_LENGTH; if (context.documentContent.length > maxLength) { - logger.warn({ + logger.warn('[MethodologySkill] Content too long', { taskId: context.taskId, contentLength: context.documentContent.length, limit: maxLength, - }, '[MethodologySkill] Content too long'); + }); return false; } @@ -81,34 +80,34 @@ export class MethodologySkill extends BaseSkill async execute( context: SkillContext, config?: MethodologyConfig - ): Promise> { + ): Promise { const maxContentLength = config?.maxContentLength || DEFAULT_MAX_CONTENT_LENGTH; - logger.info({ + logger.info('[MethodologySkill] Starting evaluation', { taskId: context.taskId, contentLength: context.documentContent.length, detectedMethods: context.methods?.length || 0, - }, '[MethodologySkill] Starting evaluation'); + }); // 截断过长内容 let content = context.documentContent; if (content.length > maxContentLength) { content = content.substring(0, maxContentLength); - logger.warn({ + logger.warn('[MethodologySkill] Content truncated', { taskId: context.taskId, originalLength: context.documentContent.length, truncatedLength: maxContentLength, - }, '[MethodologySkill] Content truncated'); + }); } // 如果 DataForensicsSkill 提取了统计方法,可以添加到 prompt 中 // 目前 reviewMethodology 不支持此参数,留作未来扩展 const methodsHint = context.methods?.join(', ') || ''; if (methodsHint) { - logger.debug({ + logger.debug('[MethodologySkill] Using detected methods as hint', { taskId: context.taskId, methodsHint, - }, '[MethodologySkill] Using detected methods as hint'); + }); } // 调用现有 
methodologyService @@ -130,13 +129,13 @@ export class MethodologySkill extends BaseSkill status = 'success'; } - logger.info({ + logger.info('[MethodologySkill] Evaluation completed', { taskId: context.taskId, score: result.overall_score, partCount: result.parts.length, errorCount, warningCount, - }, '[MethodologySkill] Evaluation completed'); + }); return { status, diff --git a/backend/src/modules/rvw/skills/test-skills.ts b/backend/src/modules/rvw/skills/test-skills.ts new file mode 100644 index 00000000..d71b75fe --- /dev/null +++ b/backend/src/modules/rvw/skills/test-skills.ts @@ -0,0 +1,103 @@ +/** + * RVW Skills 架构 - 快速验证脚本 + * + * 运行方式: npx tsx src/modules/rvw/skills/test-skills.ts + */ + +import { SkillRegistry } from './core/registry.js'; +import { ProfileResolver, DEFAULT_PROFILE } from './core/profile.js'; +import { ContextBuilder } from './core/context.js'; +import { SkillExecutor } from './core/executor.js'; +import { registerBuiltinSkills } from './library/index.js'; + +// 注册内置 Skills +registerBuiltinSkills(); + +async function main() { + console.log('='.repeat(60)); + console.log('RVW Skills V2.0 架构验证'); + console.log('='.repeat(60)); + + // 1. 测试 SkillRegistry + console.log('\n📋 1. SkillRegistry 验证'); + console.log('-'.repeat(40)); + + const summary = SkillRegistry.getSummary(); + console.log(` 已初始化: ${summary.initialized}`); + console.log(` 注册 Skills 数量: ${summary.skillCount}`); + console.log(` 分类统计:`, summary.categories); + + const allSkills = SkillRegistry.getAllMetadata(); + console.log('\n 已注册的 Skills:'); + for (const skill of allSkills) { + console.log(` - ${skill.id} (${skill.name}) v${skill.version}`); + } + + // 2. 测试 ProfileResolver + console.log('\n📋 2. 
ProfileResolver 验证'); + console.log('-'.repeat(40)); + + const defaultProfile = ProfileResolver.resolve('default'); + console.log(` 默认 Profile: ${defaultProfile.name}`); + console.log(` Pipeline 长度: ${defaultProfile.pipeline.length}`); + console.log(` Pipeline Skills:`); + for (const item of defaultProfile.pipeline) { + console.log(` - ${item.skillId} (enabled: ${item.enabled}, optional: ${item.optional})`); + } + + // 测试动态 Profile + const dynamicProfile = ProfileResolver.resolveFromAgents(['editorial', 'methodology']); + console.log(`\n 动态 Profile (editorial + methodology):`); + const enabledSkills = dynamicProfile.pipeline.filter(p => p.enabled); + console.log(` 启用的 Skills: ${enabledSkills.map(p => p.skillId).join(', ')}`); + + // 3. 测试 ContextBuilder + console.log('\n📋 3. ContextBuilder 验证'); + console.log('-'.repeat(40)); + + const context = new ContextBuilder() + .taskId('test-task-123') + .userId('test-user-456') + .documentPath('D:/MyCursor/test/document.docx') // 使用允许的路径前缀 + .documentContent('这是一篇测试论文的内容...') + .profile(defaultProfile) + .build(); + + console.log(` taskId: ${context.taskId}`); + console.log(` userId: ${context.userId}`); + console.log(` documentPath: ${context.documentPath}`); + console.log(` documentContent 长度: ${context.documentContent.length}`); + + // 4. 测试 canRun 检查 + console.log('\n📋 4. Skill canRun 检查'); + console.log('-'.repeat(40)); + + for (const skill of SkillRegistry.getAll()) { + const canRun = skill.canRun ? skill.canRun(context) : true; + console.log(` ${skill.metadata.id}: canRun = ${canRun}`); + } + + // 5. 
验证总结 + console.log('\n' + '='.repeat(60)); + console.log('✅ Skills 架构核心组件验证完成!'); + console.log('='.repeat(60)); + + // 检查是否有问题 + if (summary.skillCount < 3) { + console.log('\n⚠️ 警告: 注册的 Skills 数量少于预期 (预期 3 个)'); + } + + if (!SkillRegistry.has('DataForensicsSkill')) { + console.log('⚠️ 警告: DataForensicsSkill 未注册'); + } + if (!SkillRegistry.has('EditorialSkill')) { + console.log('⚠️ 警告: EditorialSkill 未注册'); + } + if (!SkillRegistry.has('MethodologySkill')) { + console.log('⚠️ 警告: MethodologySkill 未注册'); + } + + console.log('\n下一步: 启动后端服务,通过 API 测试完整流程'); +} + +main().catch(console.error); diff --git a/backend/src/modules/rvw/types/index.ts b/backend/src/modules/rvw/types/index.ts index a75168c6..f244bec4 100644 --- a/backend/src/modules/rvw/types/index.ts +++ b/backend/src/modules/rvw/types/index.ts @@ -65,6 +65,45 @@ export interface MethodologyReview { parts: MethodologyPart[]; } +// ==================== 数据验证(DataForensics) ==================== + +export interface ForensicsIssue { + severity: 'ERROR' | 'WARNING' | 'INFO'; + type: string; + message: string; + location?: { + tableId?: string; + cellRef?: string; + paragraph?: number; + }; + evidence?: Record; +} + +export interface ForensicsTable { + id: string; + caption: string; + html: string; + data: string[][]; + headers: string[]; + rowCount: number; + colCount: number; + skipped?: boolean; + skipReason?: string; + issues: ForensicsIssue[]; +} + +export interface ForensicsResult { + tables: ForensicsTable[]; + methods: string[]; + issues: ForensicsIssue[]; + summary: { + totalTables: number; + totalIssues: number; + errorCount: number; + warningCount: number; + }; +} + // ==================== 请求参数 ==================== /** @@ -142,6 +181,7 @@ export interface ReviewReport { overallScore?: number; editorialReview?: EditorialReview; methodologyReview?: MethodologyReview; + forensicsResult?: ForensicsResult; completedAt?: Date; durationSeconds?: number; } diff --git 
a/backend/src/modules/rvw/workers/reviewWorker.ts b/backend/src/modules/rvw/workers/reviewWorker.ts index e79d7c0f..8caaf0c0 100644 --- a/backend/src/modules/rvw/workers/reviewWorker.ts +++ b/backend/src/modules/rvw/workers/reviewWorker.ts @@ -65,16 +65,50 @@ function ensureSkillsInitialized() { } } +/** + * 清理卡住的任务(启动时调用) + * 当服务重启时,之前正在执行的任务会卡在 'reviewing' 状态 + */ +async function cleanupStuckTasks(): Promise { + try { + const stuckTasks = await prisma.reviewTask.updateMany({ + where: { + status: { + in: ['reviewing', 'reviewing_editorial', 'reviewing_methodology'], + }, + }, + data: { + status: 'failed', + errorMessage: '服务重启导致任务中断,请重新提交', + }, + }); + + if (stuckTasks.count > 0) { + logger.warn('[reviewWorker] Cleaned up stuck tasks on startup', { + count: stuckTasks.count, + }); + console.log(`⚠️ 启动时清理了 ${stuckTasks.count} 个卡住的任务`); + } + } catch (error) { + logger.error('[reviewWorker] Failed to cleanup stuck tasks', { + error: error instanceof Error ? error.message : String(error), + }); + } +} + /** * 注册审查 Worker 到队列 * * 此函数应在应用启动时调用(index.ts) */ -export function registerReviewWorker() { +export async function registerReviewWorker() { logger.info('[reviewWorker] Registering reviewWorker', { useSkillsArchitecture: USE_SKILLS_ARCHITECTURE, }); + // 清理卡住的任务 + await cleanupStuckTasks(); + // 初始化 Skills ensureSkillsInitialized(); @@ -113,6 +147,15 @@ export function registerReviewWorker() { }, }); + // 调试日志:检查 filePath + logger.info('[reviewWorker] Task info from DB', { + taskId, + filePath: existingTask?.filePath || '(empty)', + fileName: existingTask?.fileName, + fileSize: existingTask?.fileSize, + }); + console.log(` 📁 filePath: ${existingTask?.filePath || '(空)'}`); + if (existingTask?.status === 'completed' && existingTask.completedAt) { logger.warn('[reviewWorker] ⚠️ Task already completed, skipping', { jobId: job.id, @@ -223,8 +266,7 @@ export function registerReviewWorker() { // ======================================== logger.info('[reviewWorker] Updating 
task result', { taskId }); - // 构建 Skills 执行摘要(V2.0 新增,存储到 picoExtract 字段) - // 注意:picoExtract 字段暂时复用,未来迁移后移到专用字段 + // 构建 Skills 执行摘要(V2.0 新增,存储到专用 contextData 字段) const skillsContext = USE_SKILLS_ARCHITECTURE && skillsSummary ? { version: '2.0', @@ -246,7 +288,7 @@ export function registerReviewWorker() { status: 'completed', editorialReview: editorialResult as unknown as Prisma.InputJsonValue ?? Prisma.JsonNull, methodologyReview: methodologyResult as unknown as Prisma.InputJsonValue ?? Prisma.JsonNull, - picoExtract: skillsContext as unknown as Prisma.InputJsonValue ?? Prisma.JsonNull, + contextData: skillsContext as unknown as Prisma.InputJsonValue ?? Prisma.JsonNull, overallScore, editorialScore: editorialScore, methodologyScore: methodologyScore, diff --git a/docs/00-系统总体设计/00-系统当前状态与开发指南.md b/docs/00-系统总体设计/00-系统当前状态与开发指南.md index 9f79ca73..3476a37f 100644 --- a/docs/00-系统总体设计/00-系统当前状态与开发指南.md +++ b/docs/00-系统总体设计/00-系统当前状态与开发指南.md @@ -1,10 +1,11 @@ # AIclinicalresearch 系统当前状态与开发指南 -> **文档版本:** v5.1 +> **文档版本:** v5.2 > **创建日期:** 2025-11-28 > **维护者:** 开发团队 > **最后更新:** 2026-02-18 > **🎉 重大里程碑:** +> - **2026-02-18:RVW V2.0 Week 3 完成!** 统计验证扩展 + 负号归一化 + 文件格式提示 + 用户体验优化 > - **2026-02-18:RVW V2.0 Skills 架构完成!** Skills 核心框架 + 3个 Skill 实现 + ReviewWorker 改造 > - **2026-02-17:RVW V2.0 "数据侦探" Day 6 完成!** L2统计验证器 + L2.5一致性取证(SE三角验证、SD>Mean) > - **2026-02-08:IIT 事件级质控 V3.1 开发完成!** record+event 独立质控 + 规则动态过滤 + 报告去重 + AI对话增强 @@ -18,13 +19,14 @@ > - **2026-01-24:Protocol Agent 框架完成!** 可复用Agent框架+5阶段对话流程 > - **2026-01-22:OSS 存储集成完成!** 阿里云 OSS 正式接入平台基础层 > -> **最新进展(RVW V2.0 Skills 架构 2026-02-18):** +> **最新进展(RVW V2.0 Week 3 完成 2026-02-18):** +> - ✅ **负号归一化**:6 种 Unicode 负号变体支持,防止 float() 崩溃 +> - ✅ **T 检验验证增强**:智能样本量提取 + subrow 精确高亮 +> - ✅ **SE 三角/CI-P 验证增强**:多行单元格 subrow 支持 +> - ✅ **前端翻译映射更新**:6 种新 IssueType 中文翻译 +> - ✅ **文件格式提示**:PDF/.doc 上传时提示无法数据验证 > - ✅ **Skills 核心框架**:types、registry、executor、profile、context -> - ✅ **Zod 配置验证**:运行时类型安全 -> - ✅ **DataForensicsSkill**:依赖注入 + 
路径安全 + 优雅降级 -> - ✅ **EditorialSkill + MethodologySkill**:封装现有服务 -> - ✅ **ReviewWorker 改造**:集成 SkillExecutor,支持 V1/V2 架构切换 -> - ✅ **12 个新文件**:约 1735 行代码 +> - ✅ **3 个 Skill 实现**:DataForensics、Editorial、Methodology > > **部署状态:** ✅ 生产环境运行中 | 公网地址:http://8.140.53.236/ > **REDCap 状态:** ✅ 生产环境运行中 | 地址:https://redcap.xunzhengyixue.com/ @@ -67,7 +69,7 @@ | **IIT** | IIT Manager Agent | AI驱动IIT研究助手 - 双脑架构+REDCap集成 | ⭐⭐⭐⭐⭐ | 🎉 **事件级质控V3.1完成(设计100%,代码60%)** | **P0** | | **SSA** | 智能统计分析 | 队列/预测模型/RCT分析 | ⭐⭐⭐⭐⭐ | 📋 规划中 | P2 | | **ST** | 统计分析工具 | 100+轻量化统计工具 | ⭐⭐⭐⭐ | 📋 规划中 | P2 | -| **RVW** | 稿件审查系统 | 方法学评估 + 🆕数据侦探(L1/L2/L2.5验证)+ Skills架构 + Word导出 | ⭐⭐⭐⭐ | 🚀 **V2.0开发中(Week2 Day10完成)** - Skills核心框架+Skill实现+Worker改造 | P1 | +| **RVW** | 稿件审查系统 | 方法学评估 + 🆕数据侦探(L1/L2/L2.5验证)+ Skills架构 + Word导出 | ⭐⭐⭐⭐ | 🚀 **V2.0 Week3完成(85%)** - 统计验证扩展+负号归一化+文件格式提示+用户体验优化 | P1 | | **ADMIN** | 运营管理端 | Prompt管理、租户管理、用户管理、运营监控、系统知识库 | ⭐⭐⭐⭐⭐ | 🎉 **Phase 4.6完成(88%)** - Prompt知识库集成+动态注入 | **P0** | --- diff --git a/docs/03-业务模块/RVW-稿件审查系统/00-模块当前状态与开发指南.md b/docs/03-业务模块/RVW-稿件审查系统/00-模块当前状态与开发指南.md index 85f99920..6f1967bc 100644 --- a/docs/03-业务模块/RVW-稿件审查系统/00-模块当前状态与开发指南.md +++ b/docs/03-业务模块/RVW-稿件审查系统/00-模块当前状态与开发指南.md @@ -1,21 +1,30 @@ # RVW稿件审查模块 - 当前状态与开发指南 -> **文档版本:** v5.0 +> **文档版本:** v5.1 > **创建日期:** 2026-01-07 > **最后更新:** 2026-02-18 > **维护者:** 开发团队 -> **当前状态:** 🚀 **V2.0 "数据侦探" 开发中(Week 2 Day 10 完成)** +> **当前状态:** 🚀 **V2.0 "数据侦探" Week 3 完成(统计验证扩展+用户体验优化)** > **文档目的:** 快速了解RVW模块状态,为新AI助手提供上下文 > -> **🎉 V2.0 进展(2026-02-18):** +> **🎉 V2.0 进展(2026-02-18 Week 3):** +> - ✅ **负号归一化**:防止 float() 崩溃,覆盖 6 种负号变体 +> - ✅ **T 检验验证增强**:智能样本量提取 + subrow 精确高亮 +> - ✅ **SE 三角验证增强**:多行单元格 subrow 支持 +> - ✅ **CI vs P 值验证增强**:subrow 支持 + 灵活 P 值解析 +> - ✅ **前端翻译映射**:新增 6 种 IssueType 中文翻译 +> - ✅ **文件格式提示**:PDF/.doc 上传时提示无法数据验证 +> +> **🎉 V2.0 进展(Week 1-2):** > - ✅ **L1 算术验证器**:行列加总、百分比验证(Day 3) -> - ✅ **L2 统计验证器**:CI↔P 值一致性、T检验逆向验证(Day 6) +> - ✅ **L2 统计验证器**:CI↔P 值一致性、卡方检验逆向验证(Day 6) > - ✅ **L2.5 
一致性取证**:SE三角验证、SD>Mean检查(Day 6 终审提权) -> - ✅ **Word 文档解析**:python-docx 表格提取(Day 2) +> - ✅ **Word 文档解析**:python-docx 表格提取 + 特殊符号提取(Day 2) > - ✅ **Skills 核心框架**:types、registry、executor、profile、context(Day 7) -> - ✅ **DataForensicsSkill**:依赖注入、路径安全、优雅降级(Day 8) +> - ✅ **DataForensicsSkill**:OSS 集成、依赖注入、优雅降级(Day 8) > - ✅ **EditorialSkill + MethodologySkill**:封装现有服务(Day 9) > - ✅ **ReviewWorker 改造**:集成 SkillExecutor,支持 V1/V2 切换(Day 10) +> - ✅ **前端数据验证 Tab**:ForensicsReport 组件、精确单元格高亮(Week 3) --- @@ -377,37 +386,50 @@ Content-Type: multipart/form-data | 阶段 | 任务 | 状态 | 完成日期 | |------|------|------|---------| | Week 1 Day 1 | Python 服务搭建 | ✅ 已完成 | 2026-02-12 | -| Week 1 Day 2 | Word 表格提取 | ✅ 已完成 | 2026-02-13 | +| Week 1 Day 2 | Word 表格提取 + 特殊符号 | ✅ 已完成 | 2026-02-13 | | Week 1 Day 3 | L1 算术验证器 | ✅ 已完成 | 2026-02-14 | | Week 1 Day 4 | 数据结构设计 | ✅ 已完成 | 2026-02-15 | | Week 1 Day 5 | API 集成 | ✅ 已完成 | 2026-02-16 | -| **Week 2 Day 6** | **L2 统计验证器 + L2.5 一致性取证** | **✅ 已完成** | **2026-02-17** | -| Week 2 Day 7 | Skills 核心框架 | 📋 待开发 | - | -| Week 2 Day 8 | DataForensicsSkill | 📋 待开发 | - | -| Week 2 Day 9 | EditorialSkill 封装 | 📋 待开发 | - | -| Week 2 Day 10 | ReviewService 改造 | 📋 待开发 | - | +| Week 2 Day 6 | L2 统计验证器 + L2.5 一致性取证 | ✅ 已完成 | 2026-02-17 | +| Week 2 Day 7 | Skills 核心框架 | ✅ 已完成 | 2026-02-18 | +| Week 2 Day 8 | DataForensicsSkill | ✅ 已完成 | 2026-02-18 | +| Week 2 Day 9 | EditorialSkill 封装 | ✅ 已完成 | 2026-02-18 | +| Week 2 Day 10 | ReviewWorker 改造 | ✅ 已完成 | 2026-02-18 | +| **Week 3** | **统计验证扩展 + 用户体验优化** | **✅ 已完成** | **2026-02-18** | +| Week 4 | 功能测试 + Bug 修复 | 📋 待开始 | - | -**V2.0 核心功能**: +**V2.0 核心功能(已完成)**: - **L1 算术验证**:行列加总、百分比验证 -- **L2 统计验证**:CI↔P 一致性、T检验逆向、卡方检验 +- **L2 统计验证**:CI↔P 一致性、T检验逆向、卡方检验(含 subrow 精确高亮) - **L2.5 一致性取证**(终审提权):SE三角验证、SD>Mean检查 - **Skills 架构**:Skill Registry、Skill Executor、Journal Profiles +- **负号归一化**:6 种 Unicode 负号变体支持 +- **文件格式提示**:PDF/.doc 无法数据验证的用户提示 + +**Week 3 完成内容(2026-02-18)**: +- ✅ 负号归一化(防止 float() 崩溃) +- ✅ T 检验验证增强(智能样本量提取) +- ✅ SE 
三角验证增强(subrow 支持) +- ✅ CI vs P 值验证增强(subrow 支持) +- ✅ 前端翻译映射更新(6 种新 IssueType) +- ✅ 文件格式提示(Header、ReportDetail、TaskDetail) ### 后续版本(V2.1+) +- [ ] Week 4 功能测试和 Bug 修复 +- [ ] ANOVA 验证(多组比较) +- [ ] 配对 T 检验验证 +- [ ] 非参数检验(Mann-Whitney、Wilcoxon) +- [ ] .doc 格式支持(Pandoc 方案评估) +- [ ] Profile 管理 UI(期刊配置界面) - [ ] PDF报告导出优化 - [ ] PICO卡片UI实现 - [ ] 历史归档UI实现 -- [ ] L3 高级逻辑推理验证 -- [ ] 登录页面(独立产品时) -- [ ] 审稿人管理系统 -- [ ] 多轮审稿流程 -- [ ] 期刊库管理 - [ ] 独立产品打包 --- -**文档版本:** v3.2 -**最后更新:** 2026-01-10 -**当前状态:** ✅ Phase 1-6 完成,模块95%可用,Schema已隔离 -**下一步:** 生产环境部署测试 +**文档版本:** v5.1 +**最后更新:** 2026-02-18 +**当前状态:** 🚀 V2.0 "数据侦探" Week 3 完成,Skills 架构 + 统计验证 + 用户体验优化 +**下一步:** Week 4 功能测试和 Bug 修复 diff --git a/docs/03-业务模块/RVW-稿件审查系统/05-测试文档/Test_刘锦_2019—2022年昆明市二、三级医院卒中中心急性缺血性卒中静脉溶栓指标分析_定稿0314 - 副本.docx b/docs/03-业务模块/RVW-稿件审查系统/05-测试文档/Test_刘锦_2019—2022年昆明市二、三级医院卒中中心急性缺血性卒中静脉溶栓指标分析_定稿0314 - 副本.docx new file mode 100644 index 00000000..2d7c7fa2 Binary files /dev/null and b/docs/03-业务模块/RVW-稿件审查系统/05-测试文档/Test_刘锦_2019—2022年昆明市二、三级医院卒中中心急性缺血性卒中静脉溶栓指标分析_定稿0314 - 副本.docx differ diff --git a/docs/03-业务模块/RVW-稿件审查系统/06-开发记录/2026-02-18 统计验证扩展与用户体验优化.md b/docs/03-业务模块/RVW-稿件审查系统/06-开发记录/2026-02-18 统计验证扩展与用户体验优化.md new file mode 100644 index 00000000..0bf8b430 --- /dev/null +++ b/docs/03-业务模块/RVW-稿件审查系统/06-开发记录/2026-02-18 统计验证扩展与用户体验优化.md @@ -0,0 +1,184 @@ +# RVW V2.0 开发记录 - 2026-02-18 + +> **日期:** 2026-02-18 +> **阶段:** Week 3 - 统计验证扩展与用户体验优化 +> **开发者:** AI Assistant +> **状态:** ✅ 完成 + +--- + +## 📋 今日完成内容 + +### 1. 
负号归一化功能 ✅ + +**问题背景:** +- Word 文档中的负号可能是多种 Unicode 字符(数学减号 `\u2212`、En Dash `\u2013`、Em Dash `\u2014` 等) +- Python 的 `float()` 无法解析这些特殊字符,导致验证失败 + +**实现内容:** + +| 文件 | 修改 | +|------|------| +| `extraction_service/forensics/extractor.py` | 新增 `_clean_statistical_text()` 方法,在提取单元格时自动清洗 | +| `extraction_service/forensics/validator.py` | 新增 `_clean_number_string()` 和 `_safe_float()` 辅助函数 | + +**覆盖的特殊字符:** + +| Unicode | 字符 | 名称 | 清洗为 | +|---------|------|------|--------| +| `\u2212` | − | 数学减号 | `-` | +| `\u2013` | – | En Dash | `-` | +| `\u2014` | — | Em Dash | `-` | +| `\u2264` | ≤ | 小于等于 | `<=` | +| `\u2265` | ≥ | 大于等于 | `>=` | +| `\u00d7` | × | 乘号 | `x` | +| `\u200b` | | Zero-Width Space | (删除) | + +--- + +### 2. 统计验证方法扩展 ✅ + +#### 2.1 T 检验验证增强 + +**改进点:** +- 智能样本量提取:支持 `(n=50)`、`n=50`、`(50例)` 等多种格式 +- 新增 `_extract_sample_sizes_from_header()` 和 `_extract_sample_sizes_from_row()` 方法 +- 支持括号格式的 SD:`45.2 (12.3)` +- 支持多行单元格 subrow 精确高亮 + +#### 2.2 SE 三角验证增强 + +**改进点:** +- 支持多行单元格的 subrow 精确定位 +- 遍历 P 值列每一行,分别验证 +- 显示友好的行描述(如变量名) + +#### 2.3 CI vs P 值一致性验证增强 + +**改进点:** +- 支持多行单元格 subrow 精确定位 +- 支持多个 CI/P 值对的验证 +- 使用 `_parse_pvalue_flexible` 灵活解析 + +--- + +### 3. 前端翻译映射更新 ✅ + +**文件:** `frontend-v2/src/modules/rvw/components/ForensicsReport.tsx` + +新增/完善的问题类型中文翻译: + +| 代码 | 中文描述 | +|------|----------| +| `ARITHMETIC_TOTAL` | 总计行错误 | +| `STAT_CI_PVALUE_CONFLICT` | CI 与 P 值矛盾 | +| `STAT_SD_GREATER_MEAN` | SD 大于均值 | +| `STAT_REGRESSION_CI_P` | 回归 CI-P 不一致 | +| `EXTRACTION_WARNING` | 提取警告 | +| `TABLE_SKIPPED` | 表格跳过 | + +--- + +### 4. 
文件格式提示功能 ✅ + +**用户反馈:** 上传 PDF 文件后没有数据验证 Tab,需要提示用户 + +**实现内容:** + +| 文件 | 修改 | +|------|------| +| `Header.tsx` | 上传按钮下方添加蓝色提示框,推荐 .docx 格式 | +| `ReportDetail.tsx` | 非 docx 文件时显示黄色警告,解释为什么没有数据验证 | +| `TaskDetail.tsx` | 同上 | + +**提示内容:** +- **上传时:** "推荐上传 .docx 格式文件,可获得完整的数据验证功能。PDF 和 .doc 格式仅支持稿约和方法学评审。" +- **查看报告时:** "当前文件为 PDF/.doc 格式,无法进行数据验证。如需数据验证功能,请上传 .docx 格式文件。" + +--- + +## 📊 当前统计验证能力总览 + +| 验证类型 | 方法 | 状态 | +|----------|------|------| +| **L1 算术** | 百分比 n(%) | ✅ | +| **L1 算术** | Sum/Total 校验 | ✅ | +| **L2 统计** | 卡方检验 P 值逆向验证 | ✅ + subrow | +| **L2 统计** | T 检验 P 值逆向验证 | ✅ + subrow | +| **L2 统计** | CI vs P 值逻辑一致性 | ✅ + subrow | +| **L2.5 取证** | SE 三角验证 | ✅ + subrow | +| **L2.5 取证** | SD > Mean 检查 | ✅ | + +--- + +## 📁 修改的文件清单 + +### Python 后端 +- `extraction_service/forensics/extractor.py` - 负号归一化 +- `extraction_service/forensics/validator.py` - 统计验证扩展 + +### Node.js 后端 +- (无修改) + +### 前端 +- `frontend-v2/src/modules/rvw/components/ForensicsReport.tsx` - 翻译映射 +- `frontend-v2/src/modules/rvw/components/Header.tsx` - 上传提示 +- `frontend-v2/src/modules/rvw/components/ReportDetail.tsx` - 格式提示 +- `frontend-v2/src/modules/rvw/components/TaskDetail.tsx` - 格式提示 + +--- + +## 📋 待完成工作 + +### V2.0 MVP 剩余任务 + +| 任务 | 优先级 | 状态 | +|------|--------|------| +| Week 4 功能测试 | P0 | 📋 待开始 | +| Week 4 性能测试 | P1 | 📋 待开始 | +| Week 4 Bug 修复 | P0 | 📋 待开始 | +| Week 4 文档更新 | P1 | 📋 待开始 | + +### V2.1 待开发功能 + +| 功能 | 说明 | +|------|------| +| ANOVA 验证 | 多组比较 P 值验证 | +| 配对 T 检验 | 配对样本验证 | +| 非参数检验 | Mann-Whitney, Wilcoxon | +| .doc 格式支持 | 评估 Pandoc 替代方案 | +| Profile 管理 UI | 期刊配置界面 | + +--- + +## 💡 技术要点 + +### 负号归一化的重要性 + +```python +# 未清洗时 float() 会崩溃 +float('−1.5') # ValueError: could not convert string to float + +# 清洗后正常工作 +float('-1.5') # -1.5 +``` + +### Subrow 高亮原理 + +Word 表格中一个单元格可能包含多行数据(用换行符分隔),例如: + +``` +| 变量 | P值 | +|------|-----| +| 年龄 | 0.82 + 性别 0.01 <- 问题在这里 + BMI 0.95 | +``` + +通过 `data-subcoord="R2C2S2"` 属性可以精确定位到第 2 行第 2 列的第 2 个子行。 + +--- + +**文档版本:** v1.0 +**创建日期:** 
2026-02-18 +**下次更新:** Week 4 测试完成后 diff --git a/docs/03-业务模块/RVW-稿件审查系统/06-开发记录/RVW V2.0 表格提取疑难杂症专项解决方案.md b/docs/03-业务模块/RVW-稿件审查系统/06-开发记录/RVW V2.0 表格提取疑难杂症专项解决方案.md new file mode 100644 index 00000000..26e35e71 --- /dev/null +++ b/docs/03-业务模块/RVW-稿件审查系统/06-开发记录/RVW V2.0 表格提取疑难杂症专项解决方案.md @@ -0,0 +1,137 @@ +# **RVW V2.0 表格提取疑难杂症专项解决方案** + +**问题焦点:** Word 表格“假行”现象(单元格内多段落)导致的提取错位 + +**核心策略:** 从“视觉模型”回归“DOM 深度解析” + +**技术栈:** Python (python-docx) + +## **1\. 核心判断:为什么不建议全量上视觉模型?** + +您提到用视觉模型(Vision Model,如 GPT-4V, Qwen-VL)来识别,这听起来很诱人(所见即所得),但在**数据侦探**场景下有致命缺陷: + +| 维度 | 视觉模型 (VLM/OCR) | 原生解析 (python-docx) | 结论 | +| :---- | :---- | :---- | :---- | +| **数值准确性** | **95%\~99%** (存在幻觉风险) | **100%** (直接读取 XML) | ❌ 审计场景不能有 1% 的误差 | +| **小数点敏感度** | 可能漏读小数点 (0.05 \-\> 005\) | 绝对精准 | ❌ P 值验证的核心 | +| **对齐能力** | 强 (能看懂视觉对齐) | 弱 (需算法辅助) | ✅ 视觉模型优势 | +| **成本/速度** | 高/慢 (需 GPU 推理) | 极低/极快 (CPU 解析) | ❌ 影响并发性能 | + +**决策:** + +**“数据”必须信赖 XML(代码),“结构”可以用算法还原。** 我们不需要视觉模型来看数字,我们只需要一段更聪明的 Python 代码来拆解段落。 + +## **2\. 现象诊断:什么是“隐性多行”?** + +在您的截图中,Word 表格的一行(Row)内部,用户使用了 **回车键 (Enter)** 或 **软回车 (Shift+Enter)** 进行了换行。 + +**python-docx 的默认行为:** + +cell.text 会把这些段落拼接成一个字符串,例如 "DNT时间段\\n\<45 min\\n45\~60 min"。前端 HTML 渲染时,如果没有处理 \\n,或者对应列的行数不匹配,就会导致错位。 + +## **3\. 解决方案:行分裂算法 (Row Explosion)** + +我们需要在提取阶段,检测这种情况,并将“逻辑上的一行”分裂成“视觉上的多行”。 + +### **3.1 算法逻辑** + +1. **扫描 (Scan)**:遍历表格的每一行。 +2. **检测 (Detect)**:检查该行每一列的 **段落数量 (Paragraph Count)**。 + * 例如:Col 1 有 4 个段落,Col 2 有 4 个段落,Col 3 只有 1 个段落(如 P 值)。 +3. **分裂 (Explode)**: + * 取最大段落数 max\_para (如 4)。 + * 如果 max\_para \> 1,则将此行**分裂**为 4 个新行。 +4. **填充 (Fill)**: + * 对于原本有多段落的列:按顺序填充到新行。 + * 对于只有 1 个段落的列(如 P 值 0.001): + * *策略 A(重复)*:每行都填 0.001。 + * *策略 B(首行/合并)*:只填第一行,后面留空(前端处理为合并单元格)。 + +### **3.2 代码实现 Demo** + +请让 Python 工程师在 DocxTableExtractor 中加入以下逻辑: + +from docx import Document +import pandas as pd + +def explode\_word\_table\_rows(table): + """ + 高级表格提取:处理单元格内的多段落(隐性多行) + """ + structured\_data \= \[\] + + for row in table.rows: + \# 1\. 
获取该行每一列的段落内容列表 + \# cells\_content 结构: \[ \['DNT时间段', '\<45min', ...\], \['1299', '881', ...\], \['X2=..'\] \] + cells\_content \= \[\] + for cell in row.cells: + \# 过滤掉空段落,获取真实文本行 + paras \= \[p.text.strip() for p in cell.paragraphs if p.text.strip()\] + if not paras: + paras \= \[""\] \# 保持占位 + cells\_content.append(paras) + + \# 2\. 计算该行“分裂”的最大高度 + max\_height \= max(len(c) for c in cells\_content) + + \# 3\. 如果是标准单行,直接添加 + if max\_height \<= 1: + flat\_row \= \[c\[0\] if c else "" for c in cells\_content\] + structured\_data.append(flat\_row) + continue + + \# 4\. 执行分裂 (Row Explosion) + \# 针对每一层(visual\_row\_index),构建一行数据 + for i in range(max\_height): + new\_row \= \[\] + for col\_idx, cell\_paras in enumerate(cells\_content): + \# 策略:如何填充? + if len(cell\_paras) \> 1: + \# 情况 A:该列有多行,按顺序取 + \# 如果当前层级超过了该列的行数,填空(或填最后一行) + val \= cell\_paras\[i\] if i \< len(cell\_paras) else "" + else: + \# 情况 B:该列只有一行(通常是统计值 P值) + \# 只有第一行填值,模拟“合并单元格”的视觉效果 + \# 或者:val \= cell\_paras\[0\] (全部重复填充) \-\> 方便后续计算 + val \= cell\_paras\[0\] if i \== 0 else "" + + new\_row.append(val) + structured\_data.append(new\_row) + + return pd.DataFrame(structured\_data) + +\# 使用示例 +\# doc \= Document("sample.docx") +\# df \= explode\_word\_table\_rows(doc.tables\[0\]) +\# print(df) + +## **4\. 前端渲染的配合** + +为了让“数据侦探”的高亮定位准确,后端返回的数据结构必须包含**分裂后的坐标映射**。 + +**推荐的数据结构升级:** + +{ + "row\_id": "r4\_exploded\_0", // 原始第4行,分裂后的第0子行 + "is\_virtual": true, // 标记这是分裂出来的行 + "cells": \[ + { "text": "\<45 min", "source\_cell": "R4C1", "paragraph\_index": 1 }, + { "text": "881 (46.59)", "source\_cell": "R4C2", "paragraph\_index": 1 }, + { "text": "", "source\_cell": "R4C3", "is\_merged\_placeholder": true } // P值列留空 + \] +} + +**前端展示逻辑:** + +* 当后端返回 is\_merged\_placeholder: true 时,前端渲染时不显示内容,或者通过 CSS 渲染为合并单元格的样式(即不画上边框)。 + +## **5\. 总结** + +1. **别用视觉模型**:准确率风险太大,得不偿失。 +2. **用代码“分裂”段落**:Word 的 cell.paragraphs 是您的救星。 +3. 
**对齐策略**:通常临床表格中,如果一列有多行,另一列只有一行(如 P 值),那一行 P 值通常是对齐第一行或者居中的。在做\*\*数据验证(L1/L2)\*\*时,我们需要编写逻辑:*“如果检测到分裂行,且 P 值列为空,自动向上寻找最近的一个 P 值作为本行的验证依据。”* + +**实施建议:** + +请 Python 工程师立即测试上述 explode\_word\_table\_rows 逻辑。这能解决您 90% 的“HTML 只有一行”的问题。 \ No newline at end of file diff --git a/docs/03-业务模块/RVW-稿件审查系统/06-开发记录/临床统计特殊符号提取白皮书.md b/docs/03-业务模块/RVW-稿件审查系统/06-开发记录/临床统计特殊符号提取白皮书.md new file mode 100644 index 00000000..390dd0e8 --- /dev/null +++ b/docs/03-业务模块/RVW-稿件审查系统/06-开发记录/临床统计特殊符号提取白皮书.md @@ -0,0 +1,201 @@ +# **临床统计特殊符号提取白皮书** + +**用途:** 指导 Python (python-docx) 在提取 Word 表格时进行字符清洗和标准化。 + +**核心痛点:** 同一个数学含义,可能由多种不同的编码方式表示。 + +## **1\. 希腊字母类 (Greek Letters)** + +这是最容易出现乱码或识别错误的重灾区。 + +| + +| **符号** | **含义** | **常见 Unicode** | **Word 中的潜在坑 (Legacy Fonts)** | **处理建议** | + +| ![][image1] | **卡方检验** | \\u03c7 (χ) \+ \\u00b2 (²) | 1\. 字体设为 "Symbol" 的 'c' 2\. 公式编辑器对象 | **正则匹配**:\[\\u03c7\\u03a7\]2? **关键词**:chi-square, chi | + +| ![][image2] | 显著性水平 | \\u03b1 | 字体设为 "Symbol" 的 'a' | 替换为 alpha | + +| ![][image3] | 回归系数/功效 | \\u03b2 | 字体设为 "Symbol" 的 'b' | 替换为 beta | + +| ![][image4] | 总体均值 | \\u03bc | 字体设为 "Symbol" 的 'm' | 替换为 u 或 mean | + +| ![][image5] | 总体标准差 | \\u03c3 | 字体设为 "Symbol" 的 's' | 替换为 std | + +| ![][image6] | 变化量/差值 | \\u0394 (大写) | 字体设为 "Symbol" 的 'D' | 替换为 delta | + +| ![][image7] | 相关系数 | \\u03c1 | 字体设为 "Symbol" 的 'r' | 替换为 rho | + +**⚠️ 提取陷阱:** 很多老旧的 Word 文档(特别是中文期刊投稿)喜欢用 **Symbol 字体**。在 python-docx 提取 text 时,你可能会读到一个普通的英文字母 c,但用户看到的是 ![][image8]。 + +* **解决方案**:检查 run.font.name。如果字体是 Symbol,需要建立映射表(c \-\> χ, a \-\> α)。 + +## **2\. 
数学运算符类 (Operators)** + +| **符号** | **含义** | **常见 Unicode** | **Word 变体** | **处理建议** | + +| ![][image9] | **加减/标准差** | \\u00b1 | \+/-, \+ / \- | 统一标准化为 \\u00b1 | + +| ![][image10] | 小于等于 | \\u2264 | \<=, \=\< | 统一为 \<= | + +| ![][image11] | 大于等于 | \\u2265 | \>= | 统一为 \>= | + +| ![][image12] | 不等于 | \\u2260 | \!=, \<\>, /= | 统一为 \!= | + +| ![][image13] | 约等于 | \\u2248 | \~, \= | 统一为 \~= | + +| ![][image14] | **负号/减号** | \\u2212 (Minus) | \\u002d (Hyphen), \\u2013 (En Dash) | **极高危!** 必须统一替换为标准连字符 \- (\\u002d),否则 float() 转换会报错 | + +| ![][image15] | 乘号/交互项 | \\u00d7 | x, X, \* | 统一为 x | + +**⚠️ 提取陷阱:** **“负号”是数据清洗中最大的坑**。Word 会自动把连字符(Hyphen)转成破折号(Dash)或数学减号(Minus)。 + +* python 代码:value.replace('\\u2212', '-').replace('\\u2013', '-') + +## **3\. 统计学专用标记 (Statistical Notations)** + +| **符号** | **含义** | **形式** | **提取难点** | + +| ![][image16] | **样本均值** | x 上加横线 | 通常是 **Word 公式对象 (OMML)** 或 **域代码 (EQ)**,python-docx 的 .text **读不出来横线**,只能读到 x。 | + +| ![][image17] | 样本率 | p 上加尖帽 | 同上。 | + +| ![][image18] | 决定系数 | R \+ 上标 2 | python-docx 默认读成 R2。**这通常可以接受**。 | + +| ![][image19] | 下标 (如 ![][image20]) | 文本 \+ 下标 | python-docx 默认读成 Xsub。需要识别 font.subscript 属性。 | + +**⚠️ 提取陷阱:** 对于 ![][image16] 这种带修饰符的字符,python-docx 可能只能提取到底座字符 x。 + +* **策略**:对于数据侦探来说,通常我们关注的是表头里的 Mean 或 Average 关键词,而不是符号。如果表头只有 ![][image16],可能需要结合上下文推断。 + +## **4\. 
拉丁字母的特殊含义 (Latin Context)** + +虽然是普通字母,但在统计学上下文中具有特殊含义,通常以**斜体 (Italic)** 出现。 + +| **符号** | **含义** | **易混淆点** | + +| ![][image21] | t 检验统计量 | 容易混淆为时间单位 t (time) 或 吨 (ton) | + +| ![][image22] | F 检验统计量 | 女性 (Female) | + +| ![][image23] | Z 检验统计量 | \- | + +| ![][image24] | P 值 (概率) | 磷 (Phosphorus) | + +| ![][image25] | 样本量 | 牛顿 (Newton) | + +| ![][image26] | 相关系数 | 半径 (radius) | + +| ![][image27] | 回归系数 | \- | + +| ![][image28] | 优势比 | 手术室 (Operating Room), 或者 (or) | + +| ![][image29] | 风险比 | 心率 (Heart Rate) | + +| ![][image30] | 置信区间 | 心脏指数 (Cardiac Index) | + +**⚠️ 提取策略:** 不能只看字符,要看**组合**。 + +* P 单独出现且数值在 0-1 之间 \-\> P 值。 +* t 单独出现且数值 \> 0 \-\> t 值。 +* CI 后面跟着括号 (1.2-3.4) \-\> 置信区间。 + +## **5\. Python 字符串清洗工具箱 (Cleaner Utils)** + +建议在 DocxTableExtractor 中集成以下清洗函数: + +import re + +def clean\_statistical\_text(text): + if not text: + return "" + + \# 1\. 归一化负号 (CRITICAL) + text \= text.replace('\\u2212', '-').replace('\\u2013', '-').replace('\\u2014', '-') + + \# 2\. 归一化卡方 (Chi-square) + \# 处理 Symbol 字体的 'c'2 (需配合 run.font 检查,此处仅处理 Unicode) + text \= text.replace('\\u03c72', 'chi-square') + text \= text.replace('\\u03c7\\u00b2', 'chi-square') + text \= re.sub(r'\[Xxχ\]\\^?2', 'chi-square', text) \# 正则匹配常见变体 + + \# 3\. 归一化加减号 + text \= text.replace('\\u00b1', '+/-') + + \# 4\. 归一化比较符 + text \= text.replace('≤', '\<=').replace('≥', '\>=') + + \# 5\. 去除不可见字符 (Zero-width space 等) + text \= re.sub(r'\[\\u200b\\u200c\\u200d\\ufeff\]', '', text) + + return text.strip() + +## **6\. 总结** + +在 Word 提取中,最大的“鬼怪”不是复杂的 ![][image1],而是: + +1. **假的负号**(导致 float() 崩溃)。 +2. **Symbol 字体**(导致 ![][image2] 变成 a)。 +3. 
**多段落换行**(上一节已解决)。 + +只要处理好这三点,99% 的统计表格都能被正确解析。 + +[image1]: + +[image2]: + +[image3]: + +[image4]: + +[image5]: + +[image6]: + +[image7]: + +[image8]: + +[image9]: + +[image10]: + +[image11]: + +[image12]: + +[image13]: + +[image14]: + +[image15]: + +[image16]: + +[image17]: + +[image18]: + +[image19]: + +[image20]: + +[image21]: + +[image22]: + +[image23]: + +[image24]: + +[image25]: + +[image26]: + +[image27]: + +[image28]: + +[image29]: + +[image30]: \ No newline at end of file diff --git a/docs/03-业务模块/RVW-稿件审查系统/06-开发记录/务实版.md b/docs/03-业务模块/RVW-稿件审查系统/06-开发记录/务实版.md new file mode 100644 index 00000000..ea7b7a1a --- /dev/null +++ b/docs/03-业务模块/RVW-稿件审查系统/06-开发记录/务实版.md @@ -0,0 +1,149 @@ +# **RVW V2.0 表格提取疑难杂症专项解决方案 (v1.1 务实版)** + +**问题焦点:** Word 表格“隐性多行”(单元格内多段落)导致的提取与验证错位 **核心策略:** **提取层保持原貌,验证层“懒分裂” (Lazy Split)** **技术栈:** Python (python-docx, pandas) + +## **1\. 核心判断:技术选型定调** + +| 维度 | 方案 A: 视觉模型 (VLM) | 方案 B: 结构重组 (预分裂) | 方案 C: 懒分裂 (推荐) | +| :---- | :---- | :---- | :---- | +| **原理** | 用 GPT-4V 截图识别 | 提取时把 Table 拆成 N 倍行 | **提取保持 \\n,验证时 split** | +| **准确性** | 低 (幻觉/小数点风险) | 中 (容易破坏合并单元格结构) | **高 (数据无损,逻辑灵活)** | +| **复杂度** | 高 (GPU/Prompt) | 高 (重构 DataFrame 结构) | **低 (仅在 Validator 中处理)** | +| **前端适配** | 难 (无法定位) | 难 (需定制虚拟行渲染) | **易 (原生 HTML \)** | + +**最终决策:** + +1. **坚决不用视觉模型**:数值准确性是底线。 +2. **放弃“预分裂”**:不在提取阶段破坏表格的物理结构(Row/Span),避免引入元数据丢失风险。 +3. **采用“懒分裂”**:在验证逻辑中,针对特定单元格内容进行 split('\\n'),实现细粒度验证。 + +## **2\. 
提取层规范 (Extractor Layer)** + +**目标**:忠实还原 Word 文档的物理结构,不自作聪明地拆行。 + +### **2.1 Python 实现逻辑** + +在 DocxTableExtractor 中,对于单元格内的多段落,直接使用换行符 \\n 连接。 + +def extract\_cell\_text(cell): + """ + 提取单元格文本,保留段落结构 + """ + \# 过滤掉完全空白的段落,保留有内容的段落 + paragraphs \= \[p.text.strip() for p in cell.paragraphs if p.text.strip()\] + return "\\n".join(paragraphs) + +**输出数据结构示例 (JSON)**: + +{ + "row\_index": 3, + "cells": \[ + { "text": "并发症\\n颅内出血\\n牙龈出血" }, // Col 0 + { "text": "277 (14.65)\\n85 (4.49)\\n94 (4.97)" }, // Col 1 + { "text": "χ²=5.687\\nχ²=0.003\\nχ²=13.745" }, // Col 3 (统计值) + { "text": "0.017\\n0.01\\n\<0.001" } // Col 4 (P值) + \] +} + +## **3\. 验证层规范 (Validator Layer)** + +**核心逻辑:** 验证器在读取数据时,动态检测是否存在多行内容。如果存在,则在内存中“临时分裂”并逐一验证。 + +### **3.1 懒分裂验证算法 (Lazy Verification Logic)** + +def verify\_row\_statistics(row\_data, col\_map): + """ + 验证单行数据的统计逻辑(支持隐性多行) + """ + issues \= \[\] + + \# 1\. 获取目标单元格的原始文本 + \# 假设我们要验证 Col 1 (Group A) vs Col 2 (Group B) \-\> P Value + cell\_a\_text \= row\_data\[col\_map\['group\_a'\]\] + cell\_b\_text \= row\_data\[col\_map\['group\_b'\]\] + cell\_p\_text \= row\_data\[col\_map\['p\_value'\]\] + + \# 2\. 懒分裂 (Lazy Split) + lines\_a \= cell\_a\_text.split('\\n') + lines\_b \= cell\_b\_text.split('\\n') + lines\_p \= cell\_p\_text.split('\\n') + + \# 3\. 确定对齐基准(取最大行数) + max\_lines \= max(len(lines\_a), len(lines\_b), len(lines\_p)) + + \# 4\. 
逐行验证 (Line-by-Line Validation) + for i in range(max\_lines): + \# 安全获取当前行的数据(处理长度不一致情况) + val\_a \= lines\_a\[i\] if i \< len(lines\_a) else "" + val\_b \= lines\_b\[i\] if i \< len(lines\_b) else "" + + \# P 值匹配策略: + \# 如果 P 值列只有 1 行,但数据有 N 行 \-\> 广播机制 (Broadcast) + \# 如果 P 值列有 N 行 \-\> 一一对应 (One-to-One) + if len(lines\_p) \== 1 and max\_lines \> 1: + val\_p \= lines\_p\[0\] \# 策略 A: 共享 P 值 + else: + val\_p \= lines\_p\[i\] if i \< len(lines\_p) else "" \# 策略 B: 独立 P 值 + + \# 跳过空行 + if not val\_a or not val\_b or not val\_p: + continue + + \# 执行具体的统计验证 + \# 传入 line\_index=i 以便报错时定位 + error \= validate\_single\_line(val\_a, val\_b, val\_p, line\_index=i) + if error: + issues.append(error) + + return issues + +### **3.2 优势分析** + +1. **兼容性强**:完美支持您截图中的 颅内出血 | 85 | 90 | P=0.01 这种每行独立 P 值的场景。 +2. **鲁棒性**:如果只有第一行有 P 值(合并单元格视觉效果),代码中的 Broadcast 逻辑也能兜底。 +3. **定位精准**:报错信息可以包含 line\_index,告诉前端是单元格里的第几行出错了。 + +## **4\. 前端渲染规范 (Frontend Layer)** + +**目标**:使用最简单的 Web 技术还原 Word 样式,避免过度设计。 + +### **4.1 HTML 渲染策略** + +后端返回的 html 字段中,直接将 \\n 替换为 \。 + +**Python 端处理:** + +def generate\_html\_cell(text): + \# 转义 HTML 特殊字符,并将换行转为 \ + safe\_text \= html.escape(text) + return safe\_text.replace("\\n", "\") + +**前端展示效果:** + +\ + 277 (14.65)\ + 85 (4.49)\ + 94 (4.97) +\ + +### **4.2 错误高亮策略** + +由于我们不再拆分表格行(DOM 结构),高亮的最小单位是 **Cell(单元格)**。 + +* **交互设计**: + * 当发现第 2 行子数据错误时,**高亮整个单元格**。 + * **Tooltip 提示**:鼠标悬停时,显示具体错误信息:“第 2 行数据 P 值校验不通过”。 +* **进阶优化(V2.1 可选)**: + * 如果确实需要高亮某一行,Python 生成 HTML 时可以用 \ 包裹每一行: \277 (14.65)\\\85 (4.49)\ + * 但 MVP 阶段建议**只高亮单元格**,性价比最高。 + +## **5\. 
总结** + +| 模块 | 核心动作 | 复杂度 | +| :---- | :---- | :---- | +| **Python 提取** | 保持 \\n,不拆行,输出标准 JSON | ⭐ (低) | +| **Python 验证** | split('\\n'),循环对齐,独立计算 | ⭐⭐ (中) | +| **前端渲染** | 使用 \ 换行,CSS 控制对齐 | ⭐ (低) | +| **前端高亮** | 高亮整个单元格,Tooltip 说明行号 | ⭐ (低) | + +**这是目前最务实、风险最低的实施路径。** 请开发团队以此为准。 \ No newline at end of file diff --git a/extraction_service/forensics/api.py b/extraction_service/forensics/api.py index 75a82e25..d2303a56 100644 --- a/extraction_service/forensics/api.py +++ b/extraction_service/forensics/api.py @@ -173,7 +173,7 @@ async def analyze_docx( f"耗时: {execution_time_ms}ms" ) - return JSONResponse(content=result.model_dump()) + return JSONResponse(content=result.model_dump(by_alias=True)) except HTTPException: raise diff --git a/extraction_service/forensics/config.py b/extraction_service/forensics/config.py index 7dace13c..d8ceec20 100644 --- a/extraction_service/forensics/config.py +++ b/extraction_service/forensics/config.py @@ -44,6 +44,12 @@ EFFECT_SIZE_PATTERN = re.compile( re.IGNORECASE ) +# 卡方值匹配,如 "χ²=57.519" 或 "2=57.519" 或 "χ2=57.519" +CHI_SQUARE_PATTERN = re.compile( + r"(?:χ[²2]|[χx]2|2)\s*[=:]\s*(\d+\.?\d*)", + re.IGNORECASE +) + # ==================== 统计方法检测 ==================== diff --git a/extraction_service/forensics/extractor.py b/extraction_service/forensics/extractor.py index 6b2a3fa9..9a981900 100644 --- a/extraction_service/forensics/extractor.py +++ b/extraction_service/forensics/extractor.py @@ -225,8 +225,8 @@ class DocxTableExtractor: if col_idx >= num_cols: break - # 获取单元格文本 - cell_text = self._get_cell_text(cell) + # 获取单元格文本(保留换行符用于 HTML 显示) + cell_text = self._get_cell_text(cell, use_newline=True) # 检测合并范围 # python-docx 中合并单元格会重复出现同一个 cell 对象 @@ -253,13 +253,123 @@ class DocxTableExtractor: return data - def _get_cell_text(self, cell: _Cell) -> str: + # Symbol 字体字符映射表(Word 使用 Symbol 字体表示希腊字母等) + SYMBOL_CHAR_MAP = { + 'F063': 'χ', # chi + 'F032': '²', # superscript 2 + 'F061': 'α', # alpha + 'F062': 'β', # beta + 'F067': 'γ', # gamma 
+ 'F064': 'δ', # delta + 'F065': 'ε', # epsilon + 'F06D': 'μ', # mu + 'F073': 'σ', # sigma + 'F070': 'π', # pi + 'F0B2': '²', # another superscript 2 encoding + } + + def _clean_statistical_text(self, text: str) -> str: + """ + 清洗统计学文本中的特殊字符 + + 关键清洗: + 1. 负号归一化(最重要!防止 float() 崩溃) + 2. 比较符归一化 + 3. 零宽字符清理 + """ + if not text: + return "" + + # 1. 负号归一化(极高危!) + # Word 会自动把连字符转成破折号或数学减号,导致 float() 报错 + text = text.replace('\u2212', '-') # 数学减号 (Minus Sign) + text = text.replace('\u2013', '-') # En Dash + text = text.replace('\u2014', '-') # Em Dash + text = text.replace('\u2010', '-') # Hyphen + text = text.replace('\u2011', '-') # Non-Breaking Hyphen + text = text.replace('\u00ad', '-') # Soft Hyphen + + # 2. 比较符归一化 + text = text.replace('\u2264', '<=') # ≤ + text = text.replace('\u2265', '>=') # ≥ + text = text.replace('\u2260', '!=') # ≠ + text = text.replace('\u2248', '~=') # ≈ + + # 3. 加减号归一化 + # 保留 ± 原样,因为它在统计学中有特定含义(如 mean±SD) + # text = text.replace('\u00b1', '+/-') # ± + + # 4. 乘号归一化 + text = text.replace('\u00d7', 'x') # × + text = text.replace('\u2217', '*') # ∗ (asterisk operator) + + # 5. 
零宽字符清理 + text = text.replace('\u200b', '') # Zero-Width Space + text = text.replace('\u200c', '') # Zero-Width Non-Joiner + text = text.replace('\u200d', '') # Zero-Width Joiner + text = text.replace('\ufeff', '') # BOM / Zero-Width No-Break Space + text = text.replace('\u00a0', ' ') # Non-Breaking Space -> 普通空格 + + return text + + def _get_cell_text(self, cell: _Cell, use_newline: bool = False) -> str: """ 获取单元格文本(合并多个段落) + + Args: + cell: Word 单元格对象 + use_newline: 是否使用换行符连接段落(用于 HTML 显示) + + 注意:会处理 Word 的 符号字符(如 χ² 等) """ paragraphs = cell.paragraphs - texts = [p.text.strip() for p in paragraphs] - return " ".join(texts).strip() + texts = [] + + for para in paragraphs: + # 使用增强的文本提取(处理符号字符) + para_text = self._extract_paragraph_text(para) + if para_text.strip(): + texts.append(para_text.strip()) + + separator = "\n" if use_newline else " " + raw_text = separator.join(texts).strip() + + # 清洗统计学特殊字符(负号归一化等) + return self._clean_statistical_text(raw_text) + + def _extract_paragraph_text(self, para: Paragraph) -> str: + """ + 从段落中提取完整文本,包括 符号字符 + + Word 使用 表示 χ 等符号, + python-docx 的 paragraph.text 不会提取这些内容。 + """ + from docx.oxml.ns import qn + + text_parts = [] + + # 遍历段落中的所有 run 元素 + for run in para._p.iter(): + # 处理普通文本 + if run.tag == qn('w:t'): + text_parts.append(run.text or '') + + # 处理符号字符 + elif run.tag == qn('w:sym'): + font = run.get(qn('w:font')) + char_code = run.get(qn('w:char')) + + if font == 'Symbol' and char_code: + # 查找映射 + unicode_char = self.SYMBOL_CHAR_MAP.get(char_code.upper(), '') + if unicode_char: + text_parts.append(unicode_char) + else: + # 未知符号,记录警告 + logger.debug(f"Unknown Symbol char: {char_code}") + text_parts.append(f'[SYM:{char_code}]') + + return ''.join(text_parts) def _generate_html( self, @@ -296,8 +406,10 @@ class DocxTableExtractor: html_parts.append(" ") for col_idx, cell in enumerate(row, start=1): coord = f"R{row_idx}C{col_idx}" + # 为每个子行添加 span 标记,支持细粒度高亮 + cell_html = self._escape_html_with_subrows(cell, coord) 
html_parts.append( - f' {self._escape_html(cell)}' + f' {cell_html}' ) html_parts.append(" ") html_parts.append(" ") @@ -307,7 +419,43 @@ class DocxTableExtractor: return "\n".join(html_parts) def _escape_html(self, text: str) -> str: - """转义 HTML 特殊字符""" + """转义 HTML 特殊字符,并将换行符转换为
""" + escaped = ( + text + .replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + .replace("'", "'") + ) + # 将换行符转换为
标签,保留表格中的多行结构 + return escaped.replace("\n", "
") + + def _escape_html_with_subrows(self, text: str, coord: str) -> str: + """ + 转义 HTML 并为每个子行添加 span 标记,支持细粒度高亮 + + 例如:单元格内容 "0.017\n0.01\n<0.001" 会生成: + 0.017
+ 0.01
+ <0.001 + """ + lines = text.split("\n") + if len(lines) == 1: + # 单行内容,直接转义 + return self._escape_single(text) + + # 多行内容,为每行添加 span + result_parts = [] + for idx, line in enumerate(lines, start=1): + escaped_line = self._escape_single(line) + subcoord = f"{coord}S{idx}" + result_parts.append(f'{escaped_line}') + + return "
".join(result_parts) + + def _escape_single(self, text: str) -> str: + """转义单行文本的 HTML 特殊字符""" return ( text .replace("&", "&") diff --git a/extraction_service/forensics/types.py b/extraction_service/forensics/types.py index 1df79165..71fafde5 100644 --- a/extraction_service/forensics/types.py +++ b/extraction_service/forensics/types.py @@ -4,7 +4,7 @@ 定义所有数据结构,确保类型安全和接口一致性。 """ -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, model_serializer from typing import List, Dict, Any, Optional from enum import Enum @@ -59,15 +59,30 @@ class ForensicsConfig(BaseModel): class CellLocation(BaseModel): - """单元格位置(R1C1 坐标)""" - table_id: str = Field(..., description="表格 ID,如 tbl_0") + """单元格位置(R1C1 坐标),支持单元格内子行定位""" + table_id: str = Field(..., alias="tableId", description="表格 ID,如 tbl_0") row: int = Field(..., description="行号,从 1 开始") col: int = Field(..., description="列号,从 1 开始") + subrow: Optional[int] = Field(None, description="单元格内子行号,从 1 开始(用于多行单元格)") - @property - def cell_ref(self) -> str: - """返回 R1C1 格式的坐标""" - return f"R{self.row}C{self.col}" + model_config = {"populate_by_name": True} + + @model_serializer + def serialize(self) -> Dict[str, Any]: + """序列化时自动添加 cellRef 字段,支持子行坐标""" + # 基础坐标:R{row}C{col} + # 子行坐标:R{row}C{col}S{subrow} + cell_ref = f"R{self.row}C{self.col}" + if self.subrow is not None: + cell_ref += f"S{self.subrow}" + + return { + "tableId": self.table_id, + "row": self.row, + "col": self.col, + "subrow": self.subrow, + "cellRef": cell_ref + } class Issue(BaseModel): @@ -84,26 +99,30 @@ class TableData(BaseModel): id: str = Field(..., description="表格 ID,如 tbl_0") caption: Optional[str] = Field(None, description="表格标题") type: Optional[str] = Field(None, description="表格类型:BASELINE/OUTCOME/OTHER") - row_count: int = Field(..., description="行数") - col_count: int = Field(..., description="列数") + row_count: int = Field(..., alias="rowCount", description="行数") + col_count: int = Field(..., alias="colCount", description="列数") 
html: str = Field(..., description="预渲染的 HTML 片段") data: List[List[str]] = Field(..., description="二维数组数据") issues: List[Issue] = Field(default_factory=list, description="该表格的问题列表") skipped: bool = Field(default=False, description="是否被跳过(超限)") - skip_reason: Optional[str] = Field(None, description="跳过原因") + skip_reason: Optional[str] = Field(None, alias="skipReason", description="跳过原因") + + model_config = {"populate_by_name": True} class ForensicsResult(BaseModel): """数据侦探分析结果""" success: bool = Field(..., description="是否成功") - methods_found: List[str] = Field(default_factory=list, description="检测到的统计方法") + methods_found: List[str] = Field(default_factory=list, alias="methodsFound", description="检测到的统计方法") tables: List[TableData] = Field(default_factory=list, description="表格列表") - total_issues: int = Field(default=0, description="总问题数") - error_count: int = Field(default=0, description="ERROR 级别问题数") - warning_count: int = Field(default=0, description="WARNING 级别问题数") - execution_time_ms: int = Field(default=0, description="执行时间(毫秒)") + total_issues: int = Field(default=0, alias="totalIssues", description="总问题数") + error_count: int = Field(default=0, alias="errorCount", description="ERROR 级别问题数") + warning_count: int = Field(default=0, alias="warningCount", description="WARNING 级别问题数") + execution_time_ms: int = Field(default=0, alias="executionTimeMs", description="执行时间(毫秒)") error: Optional[str] = Field(None, description="错误信息(如果失败)") - fallback_available: bool = Field(default=True, description="是否可降级执行") + fallback_available: bool = Field(default=True, alias="fallbackAvailable", description="是否可降级执行") + + model_config = {"populate_by_name": True} class ExtractionError(Exception): diff --git a/extraction_service/forensics/validator.py b/extraction_service/forensics/validator.py index 4635a080..27bda667 100644 --- a/extraction_service/forensics/validator.py +++ b/extraction_service/forensics/validator.py @@ -47,6 +47,7 @@ from .config import ( 
MEAN_SD_PAREN_PATTERN, CI_PATTERNS, EFFECT_SIZE_PATTERN, + CHI_SQUARE_PATTERN, DEFAULT_TOLERANCE_PERCENT, PVALUE_ERROR_THRESHOLD, PVALUE_WARNING_THRESHOLD, @@ -54,6 +55,43 @@ from .config import ( ) +def _clean_number_string(text: str) -> str: + """ + 清洗数值字符串中的特殊字符,防止 float() 崩溃 + + 关键清洗:负号归一化(Word 会把 - 转成数学减号或破折号) + """ + if not text: + return "" + + # 负号归一化(防止 float() 崩溃) + text = text.replace('\u2212', '-') # 数学减号 (Minus Sign) + text = text.replace('\u2013', '-') # En Dash + text = text.replace('\u2014', '-') # Em Dash + text = text.replace('\u2010', '-') # Hyphen + text = text.replace('\u2011', '-') # Non-Breaking Hyphen + + # 零宽字符清理 + text = text.replace('\u200b', '') # Zero-Width Space + text = text.replace('\u00a0', ' ') # Non-Breaking Space -> 普通空格 + + return text.strip() + + +def _safe_float(text: str) -> Optional[float]: + """ + 安全的 float 转换,处理特殊字符 + + Returns: + 转换成功返回浮点数,失败返回 None + """ + try: + cleaned = _clean_number_string(text) + return float(cleaned) + except (ValueError, TypeError): + return None + + class ArithmeticValidator: """ L1 算术自洽性验证器 @@ -214,20 +252,21 @@ class ArithmeticValidator: - 纯数字 "45" - 带逗号 "1,234" - 带空格 "1 234" + - 负数(含特殊负号字符) """ if not text: return None - # 移除常见分隔符 - cleaned = text.strip().replace(",", "").replace(" ", "") + # 先清洗特殊字符(负号归一化等) + cleaned = _clean_number_string(text) - # 尝试提取第一个数字 - match = re.match(r"^(\d+(?:\.\d+)?)", cleaned) + # 移除常见分隔符 + cleaned = cleaned.replace(",", "").replace(" ", "") + + # 尝试提取数字(支持负数) + match = re.match(r"^(-?\d+(?:\.\d+)?)", cleaned) if match: - try: - return float(match.group(1)) - except ValueError: - return None + return _safe_float(match.group(1)) return None @@ -340,6 +379,11 @@ class StatValidator: ttest_issues = self._validate_ttest(table) issues.extend(ttest_issues) + # 2.5. 卡方检验逆向验证 + if SCIPY_AVAILABLE: + chi2_issues = self._validate_chi_square(table) + issues.extend(chi2_issues) + # 3. 
SE 三角验证(终审提权:回归系数 CI↔P 一致性) se_issues = self._validate_se_triangle(table) issues.extend(se_issues) @@ -364,68 +408,106 @@ class StatValidator: - 若 95% CI 不跨越 1.0(如 1.1-1.5)→ P 值必须 < 0.05 违反此规则 = 数据逻辑矛盾 + + 改进:支持多行单元格的 subrow 精确定位 """ issues: List[Issue] = [] data = table.data + if len(data) < 2: + return issues + + header = data[0] if data else [] + pvalue_col_idx = self._find_pvalue_column(header) + for row_idx, row in enumerate(data[1:], start=2): + # 获取 P 值列内容(可能有多行) + pvalue_cell = row[pvalue_col_idx] if pvalue_col_idx < len(row) else "" + pvalue_lines = pvalue_cell.split("\n") if pvalue_cell else [] + + # 获取第一列内容(用于描述) + first_cell_lines = row[0].split("\n") if row else [] + + # 整行文本用于查找 CI row_text = " ".join(row) - # 查找 CI(使用增强的 CI 解析) - ci_result = self._parse_ci(row_text) - if ci_result is None: + # 查找所有 CI + all_ci_results = [] + for pattern in CI_PATTERNS: + for match in pattern.finditer(row_text): + ci_lower = _safe_float(match.group(1)) + ci_upper = _safe_float(match.group(2)) + if ci_lower is not None and ci_upper is not None and ci_lower < ci_upper: + all_ci_results.append((ci_lower, ci_upper)) + + if not all_ci_results: + # 回退到单个 CI 解析 + ci_result = self._parse_ci(row_text) + if ci_result: + all_ci_results.append(ci_result) + + if not all_ci_results: continue - ci_lower, ci_upper = ci_result - - # 查找 P 值 - pvalue = self._parse_pvalue(row_text) - if pvalue is None: - continue - - # 检查逻辑一致性 - ci_crosses_one = ci_lower <= 1.0 <= ci_upper - p_significant = pvalue < 0.05 - - # 矛盾情况 - if ci_crosses_one and p_significant: - # CI 跨越 1 但 P < 0.05,矛盾 - issues.append(Issue( - severity=Severity.ERROR, - type=IssueType.STAT_CI_PVALUE_CONFLICT, - message=f"CI 与 P 值逻辑矛盾: 95% CI ({ci_lower}-{ci_upper}) 跨越 1.0,但 P={pvalue} < 0.05", - location=CellLocation( - table_id=table.id, - row=row_idx, - col=1 # 整行问题 - ), - evidence={ - "ci_lower": ci_lower, - "ci_upper": ci_upper, - "ci_crosses_one": ci_crosses_one, - "pvalue": pvalue, - "p_significant": p_significant - } - 
)) - elif not ci_crosses_one and not p_significant: - # CI 不跨越 1 但 P ≥ 0.05,矛盾 - issues.append(Issue( - severity=Severity.ERROR, - type=IssueType.STAT_CI_PVALUE_CONFLICT, - message=f"CI 与 P 值逻辑矛盾: 95% CI ({ci_lower}-{ci_upper}) 不跨越 1.0,但 P={pvalue} ≥ 0.05", - location=CellLocation( - table_id=table.id, - row=row_idx, - col=1 - ), - evidence={ - "ci_lower": ci_lower, - "ci_upper": ci_upper, - "ci_crosses_one": ci_crosses_one, - "pvalue": pvalue, - "p_significant": p_significant - } - )) + # 遍历 P 值行进行验证 + for line_idx, pvalue_line in enumerate(pvalue_lines): + pvalue = self._parse_pvalue_flexible(pvalue_line) + if pvalue is None: + continue + + # 获取行描述 + row_desc = first_cell_lines[line_idx] if line_idx < len(first_cell_lines) else f"第{line_idx+1}项" + + # 使用对应的 CI(如果有多个 CI,按顺序匹配) + ci_idx = min(line_idx, len(all_ci_results) - 1) + ci_lower, ci_upper = all_ci_results[ci_idx] + + # 检查逻辑一致性 + ci_crosses_one = ci_lower <= 1.0 <= ci_upper + p_significant = pvalue < 0.05 + + # 计算 subrow 索引 + subrow_idx = line_idx + 1 if len(pvalue_lines) > 1 else None + + # 矛盾情况 + if ci_crosses_one and p_significant: + issues.append(Issue( + severity=Severity.ERROR, + type=IssueType.STAT_CI_PVALUE_CONFLICT, + message=f"CI 与 P 值逻辑矛盾 [{row_desc.strip()}]: 95% CI ({ci_lower}-{ci_upper}) 跨越 1.0,但 P={pvalue} < 0.05", + location=CellLocation( + table_id=table.id, + row=row_idx, + col=pvalue_col_idx + 1, + subrow=subrow_idx + ), + evidence={ + "ci_lower": ci_lower, + "ci_upper": ci_upper, + "ci_crosses_one": ci_crosses_one, + "pvalue": pvalue, + "p_significant": p_significant + } + )) + elif not ci_crosses_one and not p_significant: + issues.append(Issue( + severity=Severity.ERROR, + type=IssueType.STAT_CI_PVALUE_CONFLICT, + message=f"CI 与 P 值逻辑矛盾 [{row_desc.strip()}]: 95% CI ({ci_lower}-{ci_upper}) 不跨越 1.0,但 P={pvalue} ≥ 0.05", + location=CellLocation( + table_id=table.id, + row=row_idx, + col=pvalue_col_idx + 1, + subrow=subrow_idx + ), + evidence={ + "ci_lower": ci_lower, + "ci_upper": 
ci_upper, + "ci_crosses_one": ci_crosses_one, + "pvalue": pvalue, + "p_significant": p_significant + } + )) return issues @@ -437,6 +519,11 @@ class StatValidator: 与报告的 P 值进行对比。 公式: t = (M1 - M2) / sqrt(SD1²/n1 + SD2²/n2) + + 改进: + 1. 智能样本量提取(表头、行首、上下文) + 2. 支持多种 Mean±SD 格式 + 3. 支持多行单元格的 subrow 精确定位 """ issues: List[Issue] = [] @@ -447,90 +534,433 @@ class StatValidator: if len(data) < 2: return issues + header = data[0] if data else [] + + # 预先从表头提取样本量 + n1, n2 = self._extract_sample_sizes_from_header(header) + + # 查找 P 值列的索引 + pvalue_col_idx = self._find_pvalue_column(header) + # 查找包含组比较数据的行 for row_idx, row in enumerate(data[1:], start=2): - # 尝试提取同一行中的两组数据 - mean_sd_matches = list(MEAN_SD_PATTERN.finditer(" ".join(row))) + row_text = " ".join(row) - if len(mean_sd_matches) >= 2: - # 找到至少两组 Mean±SD 数据 - try: - m1, sd1 = float(mean_sd_matches[0].group(1)), float(mean_sd_matches[0].group(2)) - m2, sd2 = float(mean_sd_matches[1].group(1)), float(mean_sd_matches[1].group(2)) - - # 提取 P 值 - row_text = " ".join(row) - pvalue = self._parse_pvalue(row_text) - + # 尝试提取同一行中的两组 Mean±SD 数据 + mean_sd_matches = list(MEAN_SD_PATTERN.finditer(row_text)) + + # 如果没找到,尝试括号格式 + if len(mean_sd_matches) < 2: + mean_sd_matches = list(MEAN_SD_PAREN_PATTERN.finditer(row_text)) + + if len(mean_sd_matches) < 2: + continue + + # 找到至少两组 Mean±SD 数据 + try: + m1 = _safe_float(mean_sd_matches[0].group(1)) + sd1 = _safe_float(mean_sd_matches[0].group(2)) + m2 = _safe_float(mean_sd_matches[1].group(1)) + sd2 = _safe_float(mean_sd_matches[1].group(2)) + + if None in (m1, sd1, m2, sd2): + continue + + # 如果表头没有样本量,尝试从行中提取 + local_n1, local_n2 = n1, n2 + if local_n1 is None or local_n2 is None: + local_n1, local_n2 = self._extract_sample_sizes_from_row(row, header) + + # 仍然没有样本量,跳过 + if local_n1 is None or local_n2 is None: + continue + + # 计算 SE 和 t 值 + se = math.sqrt(sd1**2/local_n1 + sd2**2/local_n2) + if se == 0: + continue + + t_calc = abs(m1 - m2) / se + df = local_n1 + local_n2 - 2 + + # 计算 P 
值 + p_calc = 2 * (1 - stats.t.cdf(t_calc, df)) + + # 从 P 值列提取报告的 P 值 + pvalue_cell = row[pvalue_col_idx] if pvalue_col_idx < len(row) else "" + pvalue_lines = pvalue_cell.split("\n") if pvalue_cell else [] + + # 尝试从整行提取 P 值(如果 P 值列没有) + if not pvalue_lines or not any(pvalue_lines): + pvalue = self._parse_pvalue_flexible(row_text) if pvalue is None: continue + pvalue_lines = [str(pvalue)] + subrow_idx = None + pvalue_col = pvalue_col_idx + 1 + else: + # 遍历 P 值单元格的每一行 + for line_idx, pvalue_line in enumerate(pvalue_lines): + pvalue = self._parse_pvalue_flexible(pvalue_line) + if pvalue is None: + continue + + # 计算子行索引 + subrow_idx = line_idx + 1 if len(pvalue_lines) > 1 else None + pvalue_col = pvalue_col_idx + 1 + + # 比较 P 值 + p_diff = abs(p_calc - pvalue) + + # 获取行描述 + first_cell_lines = row[0].split("\n") if row else [] + row_desc = first_cell_lines[line_idx] if line_idx < len(first_cell_lines) else row[0][:20] if row else "" + + if p_diff > PVALUE_ERROR_THRESHOLD: + issues.append(Issue( + severity=Severity.ERROR, + type=IssueType.STAT_TTEST_PVALUE, + message=f"T 检验 P 值矛盾 [{row_desc.strip()}]: 报告 P={pvalue},计算 P={p_calc:.4f}(差异 {p_diff:.3f})", + location=CellLocation( + table_id=table.id, + row=row_idx, + col=pvalue_col, + subrow=subrow_idx + ), + evidence={ + "group1": {"mean": m1, "sd": sd1, "n": local_n1}, + "group2": {"mean": m2, "sd": sd2, "n": local_n2}, + "t_calculated": round(t_calc, 3), + "df": df, + "p_calculated": round(p_calc, 4), + "p_reported": pvalue, + "p_difference": round(p_diff, 4) + } + )) + elif p_diff > PVALUE_WARNING_THRESHOLD: + issues.append(Issue( + severity=Severity.WARNING, + type=IssueType.STAT_TTEST_PVALUE, + message=f"T 检验 P 值轻微偏差 [{row_desc.strip()}]: 报告 P={pvalue},计算 P={p_calc:.4f}", + location=CellLocation( + table_id=table.id, + row=row_idx, + col=pvalue_col, + subrow=subrow_idx + ), + evidence={ + "p_calculated": round(p_calc, 4), + "p_reported": pvalue, + "p_difference": round(p_diff, 4) + } + )) + continue # 已处理完此行的所有 P 值 + + 
# 单个 P 值的情况 + pvalue = self._parse_pvalue_flexible(pvalue_lines[0]) if pvalue_lines else None + if pvalue is None: + continue - # 尝试从表头获取样本量(简化处理,假设 n=30) - # 实际实现需要更复杂的表格解析 - n1, n2 = self._estimate_sample_sizes(table, row_idx) - - if n1 is None or n2 is None: - continue - - # 计算 t 值 - se = math.sqrt(sd1**2/n1 + sd2**2/n2) - if se == 0: - continue - - t_calc = abs(m1 - m2) / se - df = n1 + n2 - 2 - - # 计算 P 值 - p_calc = 2 * (1 - stats.t.cdf(t_calc, df)) + p_diff = abs(p_calc - pvalue) + + if p_diff > PVALUE_ERROR_THRESHOLD: + issues.append(Issue( + severity=Severity.ERROR, + type=IssueType.STAT_TTEST_PVALUE, + message=f"T 检验 P 值不一致: 报告 P={pvalue},计算 P={p_calc:.4f}(差异 {p_diff:.3f})", + location=CellLocation( + table_id=table.id, + row=row_idx, + col=pvalue_col_idx + 1 + ), + evidence={ + "group1": {"mean": m1, "sd": sd1, "n": local_n1}, + "group2": {"mean": m2, "sd": sd2, "n": local_n2}, + "t_calculated": round(t_calc, 3), + "df": df, + "p_calculated": round(p_calc, 4), + "p_reported": pvalue, + "p_difference": round(p_diff, 4) + } + )) + elif p_diff > PVALUE_WARNING_THRESHOLD: + issues.append(Issue( + severity=Severity.WARNING, + type=IssueType.STAT_TTEST_PVALUE, + message=f"T 检验 P 值轻微偏差: 报告 P={pvalue},计算 P={p_calc:.4f}", + location=CellLocation( + table_id=table.id, + row=row_idx, + col=pvalue_col_idx + 1 + ), + evidence={ + "p_calculated": round(p_calc, 4), + "p_reported": pvalue, + "p_difference": round(p_diff, 4) + } + )) + + except (ValueError, TypeError, ZeroDivisionError) as e: + logger.debug(f"T 检验验证失败: {e}") + continue + + return issues + + def _extract_sample_sizes_from_header(self, header: List[str]) -> Tuple[Optional[int], Optional[int]]: + """ + 从表头提取样本量 + + 支持格式: + - (n=50) + - n=50 + - N=50 + - (50例) + - 对照组(n=48) + """ + n_pattern = re.compile(r"[(\[(]?\s*[nN]\s*[=::]\s*(\d+)\s*[)\])]?") + n_pattern_cn = re.compile(r"[(\[(]?\s*(\d+)\s*例\s*[)\])]?") + + n_values = [] + for cell in header: + # 优先匹配 n=XX 格式 + match = n_pattern.search(cell) + if match: 
+ try: + n_values.append(int(match.group(1))) + except ValueError: + pass + continue + + # 尝试中文格式 + match = n_pattern_cn.search(cell) + if match: + try: + n_values.append(int(match.group(1))) + except ValueError: + pass + + if len(n_values) >= 2: + return n_values[0], n_values[1] + + return None, None + + def _extract_sample_sizes_from_row( + self, + row: List[str], + header: List[str] + ) -> Tuple[Optional[int], Optional[int]]: + """ + 从数据行提取样本量 + + 策略: + 1. 查找行首的 n 值 + 2. 查找与 Mean±SD 列对应的 n 列 + """ + row_text = " ".join(row) + n_pattern = re.compile(r"\(\s*[nN]\s*[=::]\s*(\d+)\s*\)") + + matches = n_pattern.findall(row_text) + if len(matches) >= 2: + try: + return int(matches[0]), int(matches[1]) + except ValueError: + pass + + return None, None + + def _validate_chi_square(self, table: TableData) -> List[Issue]: + """ + 卡方检验逆向验证 + + 从报告的 χ² 值和推断的自由度,反算 P 值,与报告值对比。 + + 原理: + - 查找 χ²=X.XXX 和对应的 P 值 + - 估计自由度(默认 df=1,适用于大多数 2x2 比较) + - 使用卡方分布计算 P 值 + - 与报告的 P 值对比 + + 特殊处理: + - 支持多段落单元格(一个单元格内多行数据) + - 支持 P 值列没有 "P=" 前缀的情况(直接是数值) + + 适用场景: + - 医学基线特征表(分类变量比较) + - 任何报告 χ² 值和 P 值的表格 + """ + issues: List[Issue] = [] + + if not SCIPY_AVAILABLE: + return issues + + data = table.data + if len(data) < 2: + return issues + + # 首先识别表头,找到 P 值列 + header = data[0] + pvalue_col_idx = self._find_pvalue_column(header) + chi2_col_idx = self._find_stat_column(header) + + for row_idx, row in enumerate(data[1:], start=2): + # 获取统计值和 P 值单元格 + stat_cell = row[chi2_col_idx] if chi2_col_idx < len(row) else "" + pvalue_cell = row[pvalue_col_idx] if pvalue_col_idx < len(row) else "" + + # 处理多行单元格:按换行符分割 + stat_lines = stat_cell.split("\n") if stat_cell else [] + pvalue_lines = pvalue_cell.split("\n") if pvalue_cell else [] + + # 逐行匹配卡方值和 P 值 + for line_idx in range(max(len(stat_lines), len(pvalue_lines))): + stat_line = stat_lines[line_idx] if line_idx < len(stat_lines) else "" + pvalue_line = pvalue_lines[line_idx] if line_idx < len(pvalue_lines) else "" + + # 查找 χ² 值 + chi2_match = 
CHI_SQUARE_PATTERN.search(stat_line) + if not chi2_match: + continue + + chi2_value = _safe_float(chi2_match.group(1)) + if chi2_value is None or chi2_value <= 0: + continue + + # 解析 P 值(支持多种格式) + pvalue = self._parse_pvalue_flexible(pvalue_line) + if pvalue is None: + continue + + # 默认 df=1(最常见的 2x2 比较场景) + df = 1 + + try: + # 使用卡方分布计算 P 值 + p_calc = 1 - stats.chi2.cdf(chi2_value, df) # 比较 P 值 p_diff = abs(p_calc - pvalue) - if p_diff > PVALUE_ERROR_THRESHOLD: - # 严重矛盾 + # 检查显著性是否一致 + p_significant_reported = pvalue < 0.05 + p_significant_calc = p_calc < 0.05 + significance_mismatch = p_significant_reported != p_significant_calc + + # 获取子行描述(从第一列提取) + first_cell_lines = row[0].split("\n") if row else [] + sub_row_desc = first_cell_lines[line_idx] if line_idx < len(first_cell_lines) else f"子行 {line_idx + 1}" + + # 计算子行索引(从 1 开始),用于前端精确高亮 + subrow_idx = line_idx + 1 if len(pvalue_lines) > 1 else None + + if significance_mismatch: issues.append(Issue( severity=Severity.ERROR, - type=IssueType.STAT_TTEST_PVALUE, - message=f"T 检验 P 值不一致: 报告 P={pvalue},计算 P={p_calc:.4f}(差异 {p_diff:.3f})", + type=IssueType.STAT_CHI2_PVALUE, + message=f"卡方检验 P 值矛盾 [{sub_row_desc.strip()}]: χ²={chi2_value}, 报告 P={pvalue}, 计算 P={p_calc:.4f},显著性不一致", location=CellLocation( table_id=table.id, row=row_idx, - col=1 + col=pvalue_col_idx + 1, + subrow=subrow_idx ), evidence={ - "group1": {"mean": m1, "sd": sd1, "n": n1}, - "group2": {"mean": m2, "sd": sd2, "n": n2}, - "t_calculated": round(t_calc, 3), + "chi2_value": chi2_value, "df": df, "p_calculated": round(p_calc, 4), "p_reported": pvalue, - "p_difference": round(p_diff, 4) + "p_difference": round(p_diff, 4), + "sub_row": sub_row_desc.strip(), + "significance_reported": "显著" if p_significant_reported else "不显著", + "significance_calculated": "显著" if p_significant_calc else "不显著" } )) - elif p_diff > PVALUE_WARNING_THRESHOLD: - # 可能是舍入误差 + elif p_diff > PVALUE_ERROR_THRESHOLD: issues.append(Issue( severity=Severity.WARNING, - 
type=IssueType.STAT_TTEST_PVALUE, - message=f"T 检验 P 值轻微偏差: 报告 P={pvalue},计算 P={p_calc:.4f}(可能是舍入误差)", + type=IssueType.STAT_CHI2_PVALUE, + message=f"卡方检验 P 值偏差 [{sub_row_desc.strip()}]: χ²={chi2_value}, 报告 P={pvalue}, 计算 P={p_calc:.4f},差异 {p_diff:.3f}", location=CellLocation( table_id=table.id, row=row_idx, - col=1 + col=pvalue_col_idx + 1, + subrow=subrow_idx ), evidence={ + "chi2_value": chi2_value, + "df": df, "p_calculated": round(p_calc, 4), "p_reported": pvalue, - "p_difference": round(p_diff, 4) + "p_difference": round(p_diff, 4), + "sub_row": sub_row_desc.strip() } )) - except (ValueError, TypeError, ZeroDivisionError) as e: - logger.debug(f"T 检验验证失败: {e}") + except (ValueError, ZeroDivisionError, TypeError) as e: + logger.debug(f"卡方检验验证失败: {e}") continue return issues + def _find_pvalue_column(self, header: List[str]) -> int: + """查找 P 值列的索引""" + p_keywords = ["p值", "pvalue", "p-value", "p 值", "sig"] + for idx, cell in enumerate(header): + cell_lower = cell.lower().strip() + for kw in p_keywords: + if kw in cell_lower: + return idx + # 默认最后一列 + return len(header) - 1 + + def _find_stat_column(self, header: List[str]) -> int: + """查找统计值列的索引(包含 χ²/t/Z 等)""" + stat_keywords = ["统计", "stat", "χ", "chi", "t值", "z值"] + for idx, cell in enumerate(header): + cell_lower = cell.lower().strip() + for kw in stat_keywords: + if kw in cell_lower: + return idx + # 默认倒数第二列 + return len(header) - 2 + + def _parse_pvalue_flexible(self, text: str) -> Optional[float]: + """ + 灵活解析 P 值 + + 支持格式: + - P=0.05 + - P<0.001 + - 0.05(直接数值) + - <0.001(全角符号) + """ + if not text: + return None + + # 先清洗特殊字符(负号归一化等) + text = _clean_number_string(text) + + # 先尝试标准 P 值格式 + match = PVALUE_PATTERN.search(text) + if match: + val = _safe_float(match.group(1)) + if val is not None: + return val + + # 处理 <0.001 或 <0.001 格式 + less_than_match = re.search(r"[<<]\s*(\d+\.?\d*)", text) + if less_than_match: + val = _safe_float(less_than_match.group(1)) + if val is not None: + return val + + # 
直接尝试解析为数字 + val = _safe_float(text) + if val is not None and 0 <= val <= 1: # P 值范围检查 + return val + + return None + + def _validate_se_triangle(self, table: TableData) -> List[Issue]: """ SE 三角验证(终审提权) @@ -543,6 +973,8 @@ class StatValidator: - P_calculated = 2 * (1 - norm.cdf(|Z|)) 若报告的 P 值与计算的 P 值严重不一致,则存在问题。 + + 改进:支持多行单元格的 subrow 精确定位 """ issues: List[Issue] = [] data = table.data @@ -550,102 +982,130 @@ class StatValidator: if not SCIPY_AVAILABLE: return issues + header = data[0] if data else [] + pvalue_col_idx = self._find_pvalue_column(header) + for row_idx, row in enumerate(data[1:], start=2): + # 获取 P 值列的内容(可能有多行) + pvalue_cell = row[pvalue_col_idx] if pvalue_col_idx < len(row) else "" + pvalue_lines = pvalue_cell.split("\n") if pvalue_cell else [] + + # 获取第一列内容(用于描述) + first_cell_lines = row[0].split("\n") if row else [] + + # 将整行连接起来查找 OR/HR/RR 和 CI row_text = " ".join(row) - # 查找 OR/HR/RR - effect_match = EFFECT_SIZE_PATTERN.search(row_text) - if not effect_match: + # 查找所有 OR/HR/RR(可能有多个) + effect_matches = list(EFFECT_SIZE_PATTERN.finditer(row_text)) + if not effect_matches: continue - try: - effect_size = float(effect_match.group(1)) - if effect_size <= 0: - continue - except (ValueError, TypeError): + # 查找所有 CI + ci_matches = [] + for pattern in CI_PATTERNS: + ci_matches.extend(list(pattern.finditer(row_text))) + + if not ci_matches: continue - # 查找 CI - ci_result = self._parse_ci(row_text) - if ci_result is None: - continue - - ci_lower, ci_upper = ci_result - - # 确保 CI 有效(正数且 lower < upper) - if ci_lower <= 0 or ci_upper <= 0 or ci_lower >= ci_upper: - continue - - # 查找报告的 P 值 - pvalue = self._parse_pvalue(row_text) - if pvalue is None: - continue - - try: - # SE 三角计算 - ln_effect = math.log(effect_size) - ln_ci_lower = math.log(ci_lower) - ln_ci_upper = math.log(ci_upper) - - # SE = (ln(CI_upper) - ln(CI_lower)) / 3.92 (for 95% CI) - se = (ln_ci_upper - ln_ci_lower) / 3.92 - - if se <= 0: + # 遍历 P 值行,尝试匹配对应的 OR/CI + for line_idx, pvalue_line in 
enumerate(pvalue_lines): + pvalue = self._parse_pvalue_flexible(pvalue_line) + if pvalue is None: continue - # Z = ln(OR) / SE - z = abs(ln_effect) / se + # 获取当前行的描述 + row_desc = first_cell_lines[line_idx] if line_idx < len(first_cell_lines) else f"第{line_idx+1}项" - # P = 2 * (1 - norm.cdf(|Z|)) - p_calc = 2 * (1 - stats.norm.cdf(z)) - - # 比较 P 值 - p_diff = abs(p_calc - pvalue) - - if p_diff > PVALUE_ERROR_THRESHOLD: - # 严重矛盾 - issues.append(Issue( - severity=Severity.ERROR, - type=IssueType.STAT_SE_TRIANGLE, - message=f"SE 三角验证不一致: 报告 P={pvalue},由 CI 反推 P={p_calc:.4f}(差异 {p_diff:.3f})", - location=CellLocation( - table_id=table.id, - row=row_idx, - col=1 - ), - evidence={ - "effect_size": effect_size, - "ci_lower": ci_lower, - "ci_upper": ci_upper, - "se_calculated": round(se, 4), - "z_calculated": round(z, 3), - "p_calculated": round(p_calc, 4), - "p_reported": pvalue, - "p_difference": round(p_diff, 4) - } - )) - elif p_diff > PVALUE_WARNING_THRESHOLD: - # 轻微偏差,可能是舍入误差 - issues.append(Issue( - severity=Severity.WARNING, - type=IssueType.STAT_SE_TRIANGLE, - message=f"SE 三角验证轻微偏差: 报告 P={pvalue},由 CI 反推 P={p_calc:.4f}(可能是舍入误差)", - location=CellLocation( - table_id=table.id, - row=row_idx, - col=1 - ), - evidence={ - "effect_size": effect_size, - "p_calculated": round(p_calc, 4), - "p_reported": pvalue, - "p_difference": round(p_diff, 4) - } - )) + # 使用第一个有效的 OR/CI 组合进行验证 + for effect_match in effect_matches: + effect_size = _safe_float(effect_match.group(1)) + if effect_size is None or effect_size <= 0: + continue - except (ValueError, ZeroDivisionError, TypeError) as e: - logger.debug(f"SE 三角验证失败: {e}") - continue + # 查找对应的 CI + ci_result = self._parse_ci(row_text) + if ci_result is None: + continue + + ci_lower, ci_upper = ci_result + + # 确保 CI 有效 + if ci_lower <= 0 or ci_upper <= 0 or ci_lower >= ci_upper: + continue + + try: + # SE 三角计算 + ln_effect = math.log(effect_size) + ln_ci_lower = math.log(ci_lower) + ln_ci_upper = math.log(ci_upper) + + # SE = 
(ln(CI_upper) - ln(CI_lower)) / 3.92 (for 95% CI) + se = (ln_ci_upper - ln_ci_lower) / 3.92 + + if se <= 0: + continue + + # Z = ln(OR) / SE + z = abs(ln_effect) / se + + # P = 2 * (1 - norm.cdf(|Z|)) + p_calc = 2 * (1 - stats.norm.cdf(z)) + + # 比较 P 值 + p_diff = abs(p_calc - pvalue) + + # 计算 subrow 索引 + subrow_idx = line_idx + 1 if len(pvalue_lines) > 1 else None + + if p_diff > PVALUE_ERROR_THRESHOLD: + issues.append(Issue( + severity=Severity.ERROR, + type=IssueType.STAT_SE_TRIANGLE, + message=f"SE 三角验证不一致 [{row_desc.strip()}]: OR={effect_size}, 报告 P={pvalue},由 CI 反推 P={p_calc:.4f}", + location=CellLocation( + table_id=table.id, + row=row_idx, + col=pvalue_col_idx + 1, + subrow=subrow_idx + ), + evidence={ + "effect_size": effect_size, + "ci_lower": ci_lower, + "ci_upper": ci_upper, + "se_calculated": round(se, 4), + "z_calculated": round(z, 3), + "p_calculated": round(p_calc, 4), + "p_reported": pvalue, + "p_difference": round(p_diff, 4) + } + )) + elif p_diff > PVALUE_WARNING_THRESHOLD: + issues.append(Issue( + severity=Severity.WARNING, + type=IssueType.STAT_SE_TRIANGLE, + message=f"SE 三角验证轻微偏差 [{row_desc.strip()}]: OR={effect_size}, 报告 P={pvalue},计算 P={p_calc:.4f}", + location=CellLocation( + table_id=table.id, + row=row_idx, + col=pvalue_col_idx + 1, + subrow=subrow_idx + ), + evidence={ + "effect_size": effect_size, + "p_calculated": round(p_calc, 4), + "p_reported": pvalue, + "p_difference": round(p_diff, 4) + } + )) + + # 找到有效匹配后跳出 effect_match 循环 + break + + except (ValueError, ZeroDivisionError, TypeError) as e: + logger.debug(f"SE 三角验证失败: {e}") + continue return issues @@ -690,10 +1150,9 @@ class StatValidator: if not match: continue - try: - mean_val = float(match.group(1)) - sd_val = float(match.group(2)) - except (ValueError, TypeError): + mean_val = _safe_float(match.group(1)) + sd_val = _safe_float(match.group(2)) + if mean_val is None or sd_val is None: continue # 检查 SD > Mean(仅对 mean > 0 的情况) @@ -766,23 +1225,20 @@ class StatValidator: match = 
pattern.search(text) if match: try: - lower = float(match.group(1)) - upper = float(match.group(2)) - if lower < upper: # 基本合理性检查 + lower = _safe_float(match.group(1)) + upper = _safe_float(match.group(2)) + if lower is not None and upper is not None and lower < upper: return lower, upper - except (ValueError, TypeError, IndexError): + except IndexError: continue # 回退到原始的 CI_PATTERN match = CI_PATTERN.search(text) if match: - try: - lower = float(match.group(1)) - upper = float(match.group(2)) - if lower < upper: - return lower, upper - except (ValueError, TypeError): - pass + lower = _safe_float(match.group(1)) + upper = _safe_float(match.group(2)) + if lower is not None and upper is not None and lower < upper: + return lower, upper return None @@ -798,42 +1254,5 @@ class StatValidator: """ match = PVALUE_PATTERN.search(text) if match: - try: - return float(match.group(1)) - except (ValueError, TypeError): - pass + return _safe_float(match.group(1)) return None - - def _estimate_sample_sizes( - self, - table: TableData, - row_idx: int - ) -> Tuple[Optional[int], Optional[int]]: - """ - 尝试从表格中估计样本量 - - 策略: - 1. 查找表头中的 n 值 - 2. 查找 "(n=XX)" 格式 - 3. 
默认返回 None - """ - data = table.data - header = data[0] if data else [] - - # 从表头查找 (n=XX) 格式 - n_pattern = re.compile(r"\(?\s*n\s*[=:]\s*(\d+)\s*\)?", re.IGNORECASE) - - n_values = [] - for cell in header: - match = n_pattern.search(cell) - if match: - try: - n_values.append(int(match.group(1))) - except ValueError: - pass - - if len(n_values) >= 2: - return n_values[0], n_values[1] - - # 如果找不到,返回 None(不进行验证) - return None, None diff --git a/extraction_service/main.py b/extraction_service/main.py index 53a6c063..38846299 100644 --- a/extraction_service/main.py +++ b/extraction_service/main.py @@ -52,9 +52,6 @@ app.add_middleware( TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/extraction_service")) TEMP_DIR.mkdir(parents=True, exist_ok=True) -# 注册 RVW V2.0 数据侦探路由 -app.include_router(forensics_router) - # 导入服务模块 from services.pdf_extractor import extract_pdf_pymupdf from services.pdf_processor import extract_pdf, get_pdf_processing_strategy @@ -71,6 +68,7 @@ from services.doc_export_service import check_pandoc_available, convert_markdown # 新增:RVW V2.0 数据侦探模块 from forensics.api import router as forensics_router +app.include_router(forensics_router) # 兼容:nougat 相关(已废弃,保留空实现避免报错) def check_nougat_available(): return False diff --git a/frontend-v2/src/modules/rvw/components/EditorialReport.tsx b/frontend-v2/src/modules/rvw/components/EditorialReport.tsx index 7edf3199..e49ff493 100644 --- a/frontend-v2/src/modules/rvw/components/EditorialReport.tsx +++ b/frontend-v2/src/modules/rvw/components/EditorialReport.tsx @@ -66,7 +66,7 @@ export default function EditorialReport({ data }: EditorialReportProps) {
- {data.overall_score} + {Number(data.overall_score).toFixed(1)}
diff --git a/frontend-v2/src/modules/rvw/components/ForensicsReport.tsx b/frontend-v2/src/modules/rvw/components/ForensicsReport.tsx new file mode 100644 index 00000000..c3af88ec --- /dev/null +++ b/frontend-v2/src/modules/rvw/components/ForensicsReport.tsx @@ -0,0 +1,487 @@ +/** + * 数据验证报告组件 + * 展示 DataForensicsSkill 的表格验证结果 + */ +import { useState } from 'react'; +import { + AlertTriangle, + CheckCircle, + XCircle, + Info, + Table2, + FlaskConical, + ChevronDown, + ChevronUp, + MousePointerClick +} from 'lucide-react'; +import type { ForensicsResult, ForensicsIssue, ForensicsTable } from '../types'; + +interface ForensicsReportProps { + data: ForensicsResult; +} + +// 统计方法英文 -> 中文映射 +const METHOD_NAMES: Record = { + 'chi-square': '卡方检验', + 'mann-whitney': 'Mann-Whitney U 检验', + 't-test': 'T 检验', + 'anova': '方差分析', + 'fisher': 'Fisher 精确检验', + 'wilcoxon': 'Wilcoxon 检验', + 'kruskal-wallis': 'Kruskal-Wallis 检验', + 'mcnemar': 'McNemar 检验', + 'correlation': '相关性分析', + 'regression': '回归分析', + 'logistic': 'Logistic 回归', + 'cox': 'Cox 回归', + 'kaplan-meier': 'Kaplan-Meier 生存分析', +}; + +// 问题类型代码 -> 中文描述映射 +const ISSUE_TYPE_LABELS: Record = { + // L1 算术验证 + 'ARITHMETIC_PERCENT': '百分比计算错误', + 'ARITHMETIC_SUM': '合计计算错误', + 'ARITHMETIC_TOTAL': '总计行错误', + 'ARITHMETIC_MEAN': '均值计算错误', + + // L2 统计验证 + 'STAT_CHI2_PVALUE': '卡方检验 P 值', + 'STAT_TTEST_PVALUE': 'T 检验 P 值', + 'STAT_CI_PVALUE_CONFLICT': 'CI 与 P 值矛盾', + + // L2.5 一致性取证 + 'STAT_SE_TRIANGLE': 'SE 三角验证', + 'STAT_SD_GREATER_MEAN': 'SD 大于均值', + 'STAT_REGRESSION_CI_P': '回归 CI-P 不一致', + + // 一致性检查 + 'CONSISTENCY_DUPLICATE': '数据重复', + 'CONSISTENCY_MISMATCH': '数据不一致', + + // 提取问题 + 'EXTRACTION_WARNING': '提取警告', + 'TABLE_SKIPPED': '表格跳过', +}; + +export default function ForensicsReport({ data }: ForensicsReportProps) { + const [expandedTables, setExpandedTables] = useState>(new Set()); + const [highlightedCell, setHighlightedCell] = useState(null); + + // 防御性检查:确保所有数组和对象存在 + const tables = data?.tables || []; + const issues = 
data?.issues || []; + const methods = data?.methods || []; + const summary = data?.summary || { totalTables: 0, totalIssues: 0, errorCount: 0, warningCount: 0 }; + + // 创建 tableId -> caption 映射,用于显示友好的表格名称 + const tableIdToCaption: Record = {}; + tables.forEach((t, idx) => { + tableIdToCaption[t.id] = t.caption || `表格 ${idx + 1}`; + }); + + // 获取表格的友好名称 + const getTableName = (tableId: string | undefined): string => { + if (!tableId) return ''; + return tableIdToCaption[tableId] || tableId; + }; + + // 翻译统计方法名称为中文 + const translateMethod = (method: string): string => { + return METHOD_NAMES[method.toLowerCase()] || method; + }; + + // 翻译问题类型代码为中文 + const translateIssueType = (type: string): string => { + return ISSUE_TYPE_LABELS[type] || type; + }; + + const toggleTable = (tableId: string) => { + const newExpanded = new Set(expandedTables); + if (newExpanded.has(tableId)) { + newExpanded.delete(tableId); + } else { + newExpanded.add(tableId); + } + setExpandedTables(newExpanded); + }; + + const getSeverityIcon = (severity: ForensicsIssue['severity']) => { + switch (severity) { + case 'ERROR': + return ; + case 'WARNING': + return ; + case 'INFO': + return ; + } + }; + + const getSeverityColors = (severity: ForensicsIssue['severity']) => { + switch (severity) { + case 'ERROR': + return { bg: 'bg-red-50', border: 'border-red-200', text: 'text-red-700' }; + case 'WARNING': + return { bg: 'bg-amber-50', border: 'border-amber-200', text: 'text-amber-700' }; + case 'INFO': + return { bg: 'bg-blue-50', border: 'border-blue-200', text: 'text-blue-700' }; + } + }; + + const getOverallStatus = () => { + if (summary.errorCount > 0) { + return { label: '发现问题', color: 'text-red-600', bg: 'bg-red-500', icon: XCircle }; + } + if (summary.warningCount > 0) { + return { label: '需关注', color: 'text-amber-600', bg: 'bg-amber-500', icon: AlertTriangle }; + } + return { label: '数据正常', color: 'text-green-600', bg: 'bg-green-500', icon: CheckCircle }; + }; + + const status = 
getOverallStatus(); + const StatusIcon = status.icon; + + const handleCellClick = (cellRef: string | undefined) => { + if (cellRef) { + setHighlightedCell(highlightedCell === cellRef ? null : cellRef); + } + }; + + return ( +
+ {/* 总览卡片 */} +
+
+
+ {/* 状态图标 */} +
+
+ +
+ + {status.label} + +
+ + {/* 统计信息 */} +
+
+ +

数据验证报告

+
+

+ 已检测 {summary.totalTables} 张表格,发现 {summary.totalIssues} 个问题 + {methods.length > 0 && `,识别到统计方法:${methods.map(translateMethod).join('、')}`} +

+ + {/* 统计指标 */} +
+
+ + {summary.totalTables} 张表格 +
+ {summary.errorCount > 0 && ( +
+ + {summary.errorCount} 个错误 +
+ )} + {summary.warningCount > 0 && ( +
+ + {summary.warningCount} 个警告 +
+ )} + {summary.errorCount === 0 && summary.warningCount === 0 && ( +
+ + 未发现问题 +
+ )} +
+
+
+
+
+ + {/* 问题列表(按严重程度排序) */} + {issues.length > 0 && ( +
+
+

+ + 发现的问题 + + 共 {issues.length} 项 + +

+
+
+ {[...issues] + .sort((a, b) => { + const order = { ERROR: 0, WARNING: 1, INFO: 2 }; + return order[a.severity] - order[b.severity]; + }) + .map((issue, index) => { + const colors = getSeverityColors(issue.severity); + return ( +
handleCellClick(issue.location?.cellRef)} + > +
+ {getSeverityIcon(issue.severity)} +
+

{issue.message}

+ {issue.location && ( +

+ + {issue.location.tableId && getTableName(issue.location.tableId)} + {issue.location.cellRef && ` · 单元格 ${issue.location.cellRef}`} +

+ )} +
+ + {translateIssueType(issue.type)} + +
+
+ ); + })} +
+
+ )} + + {/* 表格详情 */} + {tables.length > 0 && ( +
+
+ +

表格详情

+ + 共 {tables.length} 张 + +
+ + {tables.map((table) => ( + toggleTable(table.id)} + highlightedCell={highlightedCell} + /> + ))} +
+ )} + + {/* 无表格提示 */} + {tables.length === 0 && ( +
+ +

未检测到表格数据

+

该文档可能不包含数据表格

+
+ )} +
+ ); +} + +/** + * 表格卡片组件 + */ +interface TableCardProps { + table: ForensicsTable; + expanded: boolean; + onToggle: () => void; + highlightedCell: string | null; +} + +function TableCard({ table, expanded, onToggle, highlightedCell }: TableCardProps) { + // 防御性检查:确保 issues 数组存在 + const issues = table.issues || []; + const hasIssues = issues.length > 0; + const errorCount = issues.filter(i => i.severity === 'ERROR').length; + const warningCount = issues.filter(i => i.severity === 'WARNING').length; + + return ( +
+ {/* 表格头部 */} +
+
+ {hasIssues ? ( + + ) : ( + + )} +
+

{table.caption || `表格 ${table.id}`}

+

+ {table.rowCount} 行 × {table.colCount} 列 + {table.skipped && ` · ⚠️ ${table.skipReason}`} +

+
+
+
+ {errorCount > 0 && ( + + {errorCount} 错误 + + )} + {warningCount > 0 && ( + + {warningCount} 警告 + + )} + {!hasIssues && ( + + 通过 + + )} + {expanded ? ( + + ) : ( + + )} +
+
+ + {/* 展开内容 */} + {expanded && ( +
+ {/* 表格渲染 */} +
+ +
+
+ + {/* 表格问题 */} + {issues.length > 0 && ( +
+
+

+ 该表格发现的问题 +

+ {issues.map((issue, idx) => ( +
+ {issue.severity === 'ERROR' ? ( + + ) : ( + + )} + {issue.message} +
+ ))} +
+
+ )} +
+ )} +
/**
 * Minimal shape `addHighlightToHtml` needs from an issue: only
 * `location.cellRef` is read. `ForensicsIssue[]` satisfies this structurally.
 */
type CellRefCarrier = { location?: { cellRef?: string } };

/**
 * Escapes every regular-expression metacharacter in `text` so it can be
 * embedded in a `RegExp` source and match literally.
 */
function escapeRegExpLiteral(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Adds `className` to every opening tag in `html` whose coordinate attribute
 * equals `cellRef`.
 *
 * References containing `S` (e.g. `R5C4S2`) are looked up via `data-subcoord`,
 * all others via `data-coord`. If the tag already has a double-quoted `class`
 * attribute (before or after the coordinate attribute) the class is merged
 * into it; otherwise a new `class` attribute is inserted right after the
 * coordinate attribute. A class that is already present is not added twice.
 */
function addClassByCellRef(html: string, cellRef: string, className: string): string {
  const attrName = cellRef.includes('S') ? 'data-subcoord' : 'data-coord';
  const attrText = `${attrName}="${cellRef}"`;
  // Match the whole opening tag so an existing class attribute can be found
  // regardless of its position relative to the coordinate attribute.
  const tagPattern = new RegExp(`<[^<>]*\\s${escapeRegExpLiteral(attrText)}[^<>]*>`, 'g');

  return html.replace(tagPattern, (tag) => {
    const classAttr = /(\sclass=")([^"]*)"/.exec(tag);
    if (!classAttr) {
      // Replacer functions keep `$` sequences in the reference literal.
      return tag.replace(attrText, () => `${attrText} class="${className}"`);
    }
    const [whole = '', prefix = '', value = ''] = classAttr;
    const existing = value.split(/\s+/).filter(Boolean);
    if (existing.includes(className)) {
      return tag;
    }
    const merged = [className, ...existing].join(' ');
    return tag.replace(whole, () => `${prefix}${merged}"`);
  });
}

/**
 * Adds highlight classes to a rendered HTML table.
 *
 * Two coordinate attributes are supported:
 * - `data-coord="R5C4"`       — cell level
 * - `data-subcoord="R5C4S2"`  — sub-row level (multi-line cells)
 *
 * Every element referenced by an issue's `location.cellRef` receives the
 * `has-issue` class; the element matching `highlightedCell` (the cell the
 * user clicked) additionally receives `highlighted`.
 *
 * Cell references are matched literally (regex metacharacters are escaped),
 * and classes are merged into a single `class` attribute so that several
 * issues on one cell, or a cell that already carries classes, never end up
 * with duplicate `class` attributes.
 *
 * @param html - Table markup to decorate; a falsy value is returned unchanged.
 * @param highlightedCell - Reference of the currently selected cell, or `null`.
 * @param issues - Issues whose `location.cellRef` marks problematic cells.
 * @returns The markup with `has-issue` / `highlighted` classes applied.
 */
function addHighlightToHtml(
  html: string,
  highlightedCell: string | null,
  issues: readonly CellRefCarrier[]
): string {
  if (!html) {
    return html;
  }

  // De-duplicate references so each cell is rewritten once.
  const flaggedRefs = new Set<string>();
  for (const issue of issues) {
    const cellRef = issue.location?.cellRef;
    if (cellRef) {
      flaggedRefs.add(cellRef);
    }
  }

  let result = html;
  for (const cellRef of flaggedRefs) {
    result = addClassByCellRef(result, cellRef, 'has-issue');
  }

  if (highlightedCell) {
    result = addClassByCellRef(result, highlightedCell, 'highlighted');
  }

  return result;
}
UploadCloud, Info, X } from 'lucide-react'; interface HeaderProps { onUpload: (files: FileList) => void; @@ -10,6 +10,7 @@ interface HeaderProps { export default function Header({ onUpload }: HeaderProps) { const fileInputRef = useRef(null); + const [showTip, setShowTip] = useState(true); const handleFileChange = (e: React.ChangeEvent) => { if (e.target.files && e.target.files.length > 0) { @@ -20,36 +21,56 @@ export default function Header({ onUpload }: HeaderProps) { }; return ( -
- {/* Logo区域 */} -
-
- +
+
+ {/* Logo区域 */} +
+
+ +
+
+

智能审稿系统

+

当前工作区:编辑部初审组

+
-
-

智能审稿系统

-

当前工作区:编辑部初审组

+ + {/* 上传按钮 */} +
+ +
- {/* 上传按钮 */} -
- - -
+ {/* 文件格式提示 */} + {showTip && ( +
+ +
+ 推荐上传 .docx 格式文件 + ,可获得完整的数据验证功能(表格算术校验、P值验证等)。 + PDF 和 .doc 格式仅支持稿约和方法学评审。 +
+ +
+ )}
); } diff --git a/frontend-v2/src/modules/rvw/components/MethodologyReport.tsx b/frontend-v2/src/modules/rvw/components/MethodologyReport.tsx index 25a08c71..12ac76a0 100644 --- a/frontend-v2/src/modules/rvw/components/MethodologyReport.tsx +++ b/frontend-v2/src/modules/rvw/components/MethodologyReport.tsx @@ -47,7 +47,7 @@ export default function MethodologyReport({ data }: MethodologyReportProps) {
- {data.overall_score} + {Number(data.overall_score).toFixed(1)}
diff --git a/frontend-v2/src/modules/rvw/components/ReportDetail.tsx b/frontend-v2/src/modules/rvw/components/ReportDetail.tsx index 0c18313b..4363ed15 100644 --- a/frontend-v2/src/modules/rvw/components/ReportDetail.tsx +++ b/frontend-v2/src/modules/rvw/components/ReportDetail.tsx @@ -2,24 +2,45 @@ * 报告详情页组件 */ import { useState } from 'react'; -import { ArrowLeft, FileCheck, Tag } from 'lucide-react'; +import { ArrowLeft, FileCheck, Tag, Info } from 'lucide-react'; import type { ReviewReport } from '../types'; import EditorialReport from './EditorialReport'; import MethodologyReport from './MethodologyReport'; +import ForensicsReport from './ForensicsReport'; interface ReportDetailProps { report: ReviewReport; onBack: () => void; } +type TabType = 'editorial' | 'methodology' | 'forensics'; + export default function ReportDetail({ report, onBack }: ReportDetailProps) { - const [activeTab, setActiveTab] = useState<'editorial' | 'methodology'>('editorial'); + const [activeTab, setActiveTab] = useState('editorial'); const hasEditorial = !!report.editorialReview; const hasMethodology = !!report.methodologyReview; + const hasForensics = !!report.forensicsResult; + + // 检查文件格式:非 .docx 文件无法进行数据验证 + const fileName = report.fileName || ''; + const isDocx = fileName.toLowerCase().endsWith('.docx'); + const isPdf = fileName.toLowerCase().endsWith('.pdf'); + const isDoc = fileName.toLowerCase().endsWith('.doc'); + const showNoForensicsTip = !hasForensics && (hasEditorial || hasMethodology) && (isPdf || isDoc); - // 如果只有方法学,默认显示方法学 - const effectiveTab = activeTab === 'editorial' && !hasEditorial && hasMethodology ? 
'methodology' : activeTab; + // 智能默认 Tab 选择 + const getEffectiveTab = (): TabType => { + if (activeTab === 'editorial' && hasEditorial) return 'editorial'; + if (activeTab === 'methodology' && hasMethodology) return 'methodology'; + if (activeTab === 'forensics' && hasForensics) return 'forensics'; + // 默认优先级:editorial > methodology > forensics + if (hasEditorial) return 'editorial'; + if (hasMethodology) return 'methodology'; + if (hasForensics) return 'forensics'; + return 'editorial'; + }; + const effectiveTab = getEffectiveTab(); return (
@@ -37,12 +58,12 @@ export default function ReportDetail({ report, onBack }: ReportDetailProps) {

{report.fileName} - {report.overallScore && ( + {report.overallScore != null && ( = 80 ? 'tag-green' : report.overallScore >= 60 ? 'tag-amber' : 'tag-red' }`}> - {report.overallScore}分 + {Number(report.overallScore).toFixed(1)}分 )}

@@ -59,7 +80,7 @@ export default function ReportDetail({ report, onBack }: ReportDetailProps) { {/* 内容区域 */}
{/* Tab切换 */} - {(hasEditorial || hasMethodology) && ( + {(hasEditorial || hasMethodology || hasForensics) && (
{hasEditorial && ( + )} +
+ )} + + {/* 非 docx 文件无数据验证提示 */} + {showNoForensicsTip && ( +
+ +
+ 当前文件为 {isPdf ? 'PDF' : '.doc'} 格式, + 无法进行数据验证(表格算术校验、P值验证等)。 + 如需数据验证功能,请上传 .docx 格式文件。 +
)} @@ -95,9 +140,12 @@ export default function ReportDetail({ report, onBack }: ReportDetailProps) { {effectiveTab === 'methodology' && report.methodologyReview && ( )} + {effectiveTab === 'forensics' && report.forensicsResult && ( + + )} {/* 无数据状态 */} - {!hasEditorial && !hasMethodology && ( + {!hasEditorial && !hasMethodology && !hasForensics && (

暂无评估报告

diff --git a/frontend-v2/src/modules/rvw/components/TaskDetail.tsx b/frontend-v2/src/modules/rvw/components/TaskDetail.tsx index 2814cdab..0254be08 100644 --- a/frontend-v2/src/modules/rvw/components/TaskDetail.tsx +++ b/frontend-v2/src/modules/rvw/components/TaskDetail.tsx @@ -3,15 +3,18 @@ * 支持显示审稿进度和结果 */ import { useState, useEffect } from 'react'; -import { ArrowLeft, FileCheck, Clock, AlertCircle, CheckCircle, Loader2, FileText, Bot } from 'lucide-react'; +import { ArrowLeft, FileCheck, Clock, AlertCircle, CheckCircle, Loader2, FileText, Bot, Info } from 'lucide-react'; import { Document, Packer, Paragraph, TextRun, HeadingLevel, AlignmentType, Table, TableRow, TableCell, WidthType } from 'docx'; import { saveAs } from 'file-saver'; import type { ReviewTask, ReviewReport, TaskStatus } from '../types'; import EditorialReport from './EditorialReport'; import MethodologyReport from './MethodologyReport'; +import ForensicsReport from './ForensicsReport'; import * as api from '../api'; import { message } from 'antd'; +type TabType = 'editorial' | 'methodology' | 'forensics'; + interface TaskDetailProps { task: ReviewTask; jobId?: string | null; // pg-boss 任务ID(可选,用于更精确的状态轮询) @@ -49,7 +52,7 @@ const getProgressSteps = (selectedAgents: string[]) => { export default function TaskDetail({ task: initialTask, jobId, onBack }: TaskDetailProps) { const [task, setTask] = useState(initialTask); const [report, setReport] = useState(null); - const [activeTab, setActiveTab] = useState<'editorial' | 'methodology'>('editorial'); + const [activeTab, setActiveTab] = useState('editorial'); const [elapsedTime, setElapsedTime] = useState(0); // Suppress unused variable warning - jobId is reserved for future use @@ -110,6 +113,8 @@ export default function TaskDetail({ task: initialTask, jobId, onBack }: TaskDet setActiveTab('editorial'); } else if (report.methodologyReview) { setActiveTab('methodology'); + } else if (report.forensicsResult) { + setActiveTab('forensics'); } } }, 
[report]); @@ -196,7 +201,7 @@ export default function TaskDetail({ task: initialTask, jobId, onBack }: TaskDet width: { size: 2000, type: WidthType.DXA }, }), new TableCell({ - children: [new Paragraph(`${report.overallScore || '-'} 分`)], + children: [new Paragraph(`${report.overallScore != null ? Number(report.overallScore).toFixed(1) : '-'} 分`)], width: { size: 7000, type: WidthType.DXA }, }), ], @@ -532,7 +537,7 @@ export default function TaskDetail({ task: initialTask, jobId, onBack }: TaskDet 审查用时 {report.durationSeconds ? formatTime(report.durationSeconds) : '-'}

-
{report.overallScore || '-'}
+
{report.overallScore != null ? Number(report.overallScore).toFixed(1) : '-'}
@@ -562,7 +567,39 @@ export default function TaskDetail({ task: initialTask, jobId, onBack }: TaskDet 方法学评估 ({report.methodologyReview.overall_score}分) )} + {report.forensicsResult && ( + + )}
+ + {/* 非 docx 文件无数据验证提示 */} + {!report.forensicsResult && (report.editorialReview || report.methodologyReview) && (() => { + const fileName = task.fileName || ''; + const isPdf = fileName.toLowerCase().endsWith('.pdf'); + const isDoc = fileName.toLowerCase().endsWith('.doc'); + if (isPdf || isDoc) { + return ( +
+ +
+ 当前文件为 {isPdf ? 'PDF' : '.doc'} 格式, + 无法进行数据验证(表格算术校验、P值验证等)。 + 如需数据验证功能,请上传 .docx 格式文件。 +
+
+ ); + } + return null; + })()} {/* 报告内容 */} {activeTab === 'editorial' && report.editorialReview && ( @@ -571,6 +608,9 @@ export default function TaskDetail({ task: initialTask, jobId, onBack }: TaskDet {activeTab === 'methodology' && report.methodologyReview && ( )} + {activeTab === 'forensics' && report.forensicsResult && ( + + )} )}
diff --git a/frontend-v2/src/modules/rvw/types/index.ts b/frontend-v2/src/modules/rvw/types/index.ts index b755f83b..f5a7a7ce 100644 --- a/frontend-v2/src/modules/rvw/types/index.ts +++ b/frontend-v2/src/modules/rvw/types/index.ts @@ -72,10 +72,51 @@ export interface MethodologyReviewResult { parts: MethodologyPart[]; } +// 数据验证问题 +export interface ForensicsIssue { + severity: 'ERROR' | 'WARNING' | 'INFO'; + type: string; + message: string; + location?: { + tableId?: string; + cellRef?: string; + paragraph?: number; + }; + evidence?: Record; +} + +// 表格数据 +export interface ForensicsTable { + id: string; + caption: string; + html: string; + data: string[][]; + headers: string[]; + rowCount: number; + colCount: number; + skipped?: boolean; + skipReason?: string; + issues: ForensicsIssue[]; +} + +// 数据验证结果 +export interface ForensicsResult { + tables: ForensicsTable[]; + methods: string[]; + issues: ForensicsIssue[]; + summary: { + totalTables: number; + totalIssues: number; + errorCount: number; + warningCount: number; + }; +} + // 完整审查报告 export interface ReviewReport extends ReviewTask { editorialReview?: EditorialReviewResult; methodologyReview?: MethodologyReviewResult; + forensicsResult?: ForensicsResult; modelUsed?: string; }