hotfix(dc/tool-c): Fix compute formula validation and binning NaN serialization

Critical fixes:
1. Compute column: Add Chinese comma support in formula validation
   - Problem: Formula with Chinese comma failed validation
   - Fix: Add Chinese comma character to allowed_chars regex
   - Example: Support formulas like 'col1(kg)+ col2，col3' (note the full-width Chinese comma '，')

2. Binning operation: Fix NaN serialization error
   - Problem: 'Out of range float values are not JSON compliant: nan'
   - Fix: Enhanced NaN/inf handling in binning endpoint
   - Added np.inf/-np.inf replacement before JSON serialization
   - Added manual JSON serialization with NaN->null conversion

3. Enhanced all operation endpoints for consistency
   - Updated conditional, dropna endpoints with same NaN/inf handling
   - Ensures all operations return JSON-compliant data

Modified files:
- extraction_service/operations/compute.py: Add Chinese comma to regex
- extraction_service/main.py: Enhanced NaN handling in binning/conditional/dropna

Status: Hotfix complete, ready for testing
This commit is contained in:
2025-12-09 08:45:27 +08:00
parent 91cab452d1
commit 75ceeb0653
79 changed files with 111 additions and 14 deletions

View File

@@ -851,15 +851,22 @@ async def operation_binning(request: BinningRequest):
request.num_bins
)
# 转换回JSON处理Categorical类型NaN值
# 转换回JSON（处理Categorical类型NaN值和inf值）
import numpy as np
# 1. 将Categorical列转为字符串
for col in result_df.columns:
if pd.api.types.is_categorical_dtype(result_df[col]):
result_df[col] = result_df[col].astype(str)
# 2. 将NaN替换为None避免JSON序列化错误
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
result_data = result_df_clean.to_dict('records')
# 2. 替换inf和-inf为None
result_df = result_df.replace([np.inf, -np.inf], None)
# 3. 将NaN替换为None避免JSON序列化错误
result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})
# 4. 转换为dict
result_data = result_df.to_dict('records')
# 恢复stdout
sys.stdout = sys.__stdout__
@@ -869,13 +876,22 @@ async def operation_binning(request: BinningRequest):
logger.info(f"分箱成功: {request.column}{request.new_column_name}")
return JSONResponse(content={
# 使用json.dumps手动序列化确保NaN完全处理
import json
response_content = {
"success": True,
"result_data": result_data,
"output": output,
"execution_time": execution_time,
"result_shape": [len(result_data), len(result_df.columns)]
})
}
# 手动序列化NaN会被转为null
json_str = json.dumps(response_content, allow_nan=True)
# 替换NaN为null防止任何遗漏的NaN
json_str = json_str.replace('NaN', 'null').replace('Infinity', 'null').replace('-Infinity', 'null')
return JSONResponse(content=json.loads(json_str))
except Exception as e:
sys.stdout = sys.__stdout__
@@ -937,9 +953,11 @@ async def operation_conditional(request: ConditionalRequest):
request.else_value
)
# 转换回JSON处理NaN值
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
result_data = result_df_clean.to_dict('records')
# 转换回JSON（处理NaN和inf值）
import numpy as np
result_df = result_df.replace([np.inf, -np.inf], None)
result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})
result_data = result_df.to_dict('records')
# 恢复stdout
sys.stdout = sys.__stdout__
@@ -1015,9 +1033,11 @@ async def operation_dropna(request: DropnaRequest):
subset=request.subset
)
# 转换回JSON处理NaN值
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
result_data = result_df_clean.to_dict('records')
# 转换回JSON（处理NaN和inf值）
import numpy as np
result_df = result_df.replace([np.inf, -np.inf], None)
result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})
result_data = result_df.to_dict('records')
# 恢复stdout
sys.stdout = sys.__stdout__