first commit
This commit is contained in:
216
src/mysql.js
Normal file
216
src/mysql.js
Normal file
@@ -0,0 +1,216 @@
|
||||
const mysql = require("mysql2/promise");
|
||||
|
||||
let pool = null;
|
||||
|
||||
/**
 * Reports whether a MySQL host has been supplied through the environment.
 *
 * @returns {boolean} true when MYSQL_HOST is set to a non-blank value.
 */
function isMysqlConfigured() {
  const host = process.env.MYSQL_HOST;
  if (!host) {
    return false;
  }
  return String(host).trim().length > 0;
}
|
||||
|
||||
/**
 * Returns the shared mysql2 connection pool, creating it on first use.
 *
 * @returns {object|null} the pool, or null when MYSQL_HOST is not configured.
 */
function getMysqlPool() {
  if (!isMysqlConfigured()) {
    return null;
  }
  if (pool) {
    return pool;
  }

  const env = process.env;
  const poolOptions = {
    host: env.MYSQL_HOST.trim(),
    port: Number(env.MYSQL_PORT || 3306),
    user: env.MYSQL_USER || "root",
    password: env.MYSQL_PASSWORD || "",
    database: env.MYSQL_DATABASE || "Resume",
    waitForConnections: true,
    connectionLimit: Number(env.MYSQL_POOL_SIZE || 10),
    queueLimit: 0,
    enableKeepAlive: true,
    connectTimeout: Number(env.MYSQL_CONNECT_TIMEOUT_MS || 15000),
  };
  pool = mysql.createPool(poolOptions);
  return pool;
}
|
||||
|
||||
/**
 * Opens one pooled connection and runs two trivial queries to prove the
 * database is reachable.
 *
 * @returns {Promise<object>} `{ ok: false, skipped: true, message }` when
 *   MySQL is not configured, otherwise `{ ok: true, skipped: false, database }`.
 *   Connection and query errors are not caught here.
 */
async function testMysqlConnection() {
  if (!isMysqlConfigured()) {
    return { ok: false, skipped: true, message: "未配置 MYSQL_HOST" };
  }

  const connection = await getMysqlPool().getConnection();
  try {
    await connection.query("SELECT 1 AS ok");
    const [dbRows] = await connection.query("SELECT DATABASE() AS db");
    const database = dbRows[0]?.db || null;
    return { ok: true, skipped: false, database };
  } finally {
    // Always hand the connection back to the pool.
    connection.release();
  }
}
|
||||
|
||||
/**
 * Non-throwing wrapper around testMysqlConnection for health endpoints.
 *
 * @returns {Promise<object>} `{ ok, skipped }` plus `database` on success or
 *   `error` (message text) on failure.
 */
async function getMysqlHealth() {
  try {
    const probe = await testMysqlConnection();
    return probe.skipped
      ? { ok: false, skipped: true }
      : { ok: true, skipped: false, database: probe.database };
  } catch (err) {
    const error = err?.message || String(err);
    return { ok: false, skipped: false, error };
  }
}
|
||||
|
||||
const CREATE_RESUME_SUBMISSIONS_SQL = `
|
||||
CREATE TABLE IF NOT EXISTS resume_submissions (
|
||||
id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
|
||||
original_filename VARCHAR(512) NOT NULL,
|
||||
mime_type VARCHAR(255) NOT NULL,
|
||||
source_type VARCHAR(32) NOT NULL,
|
||||
file_sha256 CHAR(64) NOT NULL,
|
||||
name VARCHAR(255) NULL COMMENT '姓名',
|
||||
city VARCHAR(255) NULL COMMENT '城市',
|
||||
age VARCHAR(64) NULL COMMENT '年龄',
|
||||
expected_salary VARCHAR(255) NULL COMMENT '期望薪资',
|
||||
education_experience TEXT NULL COMMENT '教育经历',
|
||||
education VARCHAR(512) NULL COMMENT '学历',
|
||||
ability TEXT NULL COMMENT '能力',
|
||||
work_experience TEXT NULL COMMENT '工作经历',
|
||||
project_experience TEXT NULL COMMENT '项目经历',
|
||||
tech_stack TEXT NULL COMMENT '技术栈',
|
||||
interview_report JSON NULL COMMENT 'AI面试题与评估',
|
||||
pdf_meta JSON NULL COMMENT 'PDF解析元信息',
|
||||
parser VARCHAR(64) NULL,
|
||||
parser_note TEXT NULL,
|
||||
warnings_json JSON NULL,
|
||||
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
PRIMARY KEY (id),
|
||||
KEY idx_file_sha256 (file_sha256),
|
||||
KEY idx_name (name(64)),
|
||||
KEY idx_city (city(64)),
|
||||
KEY idx_created_at (created_at)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
|
||||
`;
|
||||
|
||||
/**
 * Makes sure the resume_submissions table exists with the current schema.
 * If an existing table still has the legacy `extracted_text` column, the
 * table is dropped (with its rows) and recreated.
 *
 * @returns {Promise<object>} `{ ok: false, skipped: true }` when MySQL is not
 *   configured, otherwise `{ ok: true }`.
 */
async function ensureResumeTables() {
  const mysqlPool = getMysqlPool();
  if (!mysqlPool) {
    return { ok: false, skipped: true };
  }

  const legacyColumnSql = `SELECT COUNT(*) AS n FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_SCHEMA = DATABASE()
AND TABLE_NAME = 'resume_submissions'
AND COLUMN_NAME = 'extracted_text'`;

  const connection = await mysqlPool.getConnection();
  try {
    const [legacyRows] = await connection.query(legacyColumnSql);
    const hasLegacyColumn = legacyRows[0]?.n > 0;
    if (hasLegacyColumn) {
      await connection.query("DROP TABLE IF EXISTS resume_submissions");
      console.log("[mysql] 已删除旧版 resume_submissions(含 extracted_text),将按字段重建");
    }
    await connection.query(CREATE_RESUME_SUBMISSIONS_SQL);
    return { ok: true };
  } finally {
    connection.release();
  }
}
|
||||
|
||||
/**
 * Converts a resume_submissions row into the Chinese-keyed "parsed resume"
 * object used by the API responses.
 *
 * @param {object|null|undefined} row - database row.
 * @returns {object|null} null for a falsy row; otherwise an object whose
 *   ten fields default to "" when the column is empty.
 */
function dbRowToParsed(row) {
  if (!row) {
    return null;
  }

  // [response key, database column] in response order.
  const fieldToColumn = [
    ["姓名", "name"],
    ["城市", "city"],
    ["年龄", "age"],
    ["期望薪资", "expected_salary"],
    ["教育经历", "education_experience"],
    ["学历", "education"],
    ["能力", "ability"],
    ["工作经历", "work_experience"],
    ["项目经历", "project_experience"],
    ["技术栈", "tech_stack"],
  ];

  const parsed = {};
  for (const [field, column] of fieldToColumn) {
    parsed[field] = row[column] || "";
  }
  return parsed;
}
|
||||
|
||||
/**
 * Looks up the newest stored submission with the given file hash that
 * already carries an interview report.
 *
 * @param {string} fileSha256 - hex SHA-256 of the uploaded file.
 * @returns {Promise<object|null>} the row, or null when MySQL is not
 *   configured, the hash is empty, or nothing matches.
 */
async function findDuplicateResumeByFileHash(fileSha256) {
  const mysqlPool = getMysqlPool();
  const canQuery = Boolean(mysqlPool) && Boolean(fileSha256);
  if (!canQuery) {
    return null;
  }

  const lookupSql = `SELECT * FROM resume_submissions
WHERE file_sha256 = ?
AND interview_report IS NOT NULL
ORDER BY id DESC
LIMIT 1`;
  const [matches] = await mysqlPool.execute(lookupSql, [fileSha256]);
  const [newest] = matches;
  return newest || null;
}
|
||||
|
||||
/**
 * Inserts one resume submission row.
 *
 * @param {object} row - camelCase payload; `originalFilename`, `mimeType`,
 *   `sourceType` and `fileSha256` are passed through as-is, every other
 *   field falls back to NULL when null/undefined.
 * @returns {Promise<number|null>} the new row id, or null when MySQL is not
 *   configured.
 */
async function insertResumeSubmission(row) {
  const mysqlPool = getMysqlPool();
  if (!mysqlPool) {
    return null;
  }

  const insertSql = `
INSERT INTO resume_submissions (
original_filename,
mime_type,
source_type,
file_sha256,
name,
city,
age,
expected_salary,
education_experience,
education,
ability,
work_experience,
project_experience,
tech_stack,
interview_report,
pdf_meta,
parser,
parser_note,
warnings_json
) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
`;

  // Payload keys for the nullable columns, in the same order as the column list.
  const nullableKeys = [
    "name",
    "city",
    "age",
    "expectedSalary",
    "educationExperience",
    "education",
    "ability",
    "workExperience",
    "projectExperience",
    "techStack",
    "interviewReport",
    "pdfMeta",
    "parser",
    "parserNote",
    "warningsJson",
  ];
  const bindValues = [
    row.originalFilename,
    row.mimeType,
    row.sourceType,
    row.fileSha256,
    ...nullableKeys.map((key) => row[key] ?? null),
  ];

  const [outcome] = await mysqlPool.execute(insertSql, bindValues);
  return outcome.insertId;
}
|
||||
|
||||
module.exports = {
|
||||
getMysqlPool,
|
||||
testMysqlConnection,
|
||||
getMysqlHealth,
|
||||
isMysqlConfigured,
|
||||
ensureResumeTables,
|
||||
insertResumeSubmission,
|
||||
findDuplicateResumeByFileHash,
|
||||
dbRowToParsed,
|
||||
};
|
||||
874
src/server.js
Normal file
874
src/server.js
Normal file
@@ -0,0 +1,874 @@
|
||||
const path = require("path");
|
||||
const crypto = require("crypto");
|
||||
const { pathToFileURL } = require("url");
|
||||
const express = require("express");
|
||||
const cors = require("cors");
|
||||
const multer = require("multer");
|
||||
const mammoth = require("mammoth");
|
||||
const { PDFParse } = require("pdf-parse");
|
||||
const { createWorker } = require("tesseract.js");
|
||||
const OpenAI = require("openai");
|
||||
const {
|
||||
getMysqlHealth,
|
||||
testMysqlConnection,
|
||||
ensureResumeTables,
|
||||
insertResumeSubmission,
|
||||
getMysqlPool,
|
||||
findDuplicateResumeByFileHash,
|
||||
dbRowToParsed,
|
||||
} = require("./mysql");
|
||||
const {
|
||||
saveSession,
|
||||
getSession,
|
||||
deleteSession,
|
||||
} = require("./upload-session-cache");
|
||||
require("dotenv").config();
|
||||
|
||||
const app = express();
|
||||
app.use(cors());
|
||||
app.use(express.json({ limit: "1mb" }));
|
||||
// 注意:静态资源必须在 API 路由之后挂载,否则部分环境下 /api/* 可能被错误处理
|
||||
|
||||
const resumeFields = [
|
||||
"姓名",
|
||||
"城市",
|
||||
"年龄",
|
||||
"期望薪资",
|
||||
"教育经历",
|
||||
"学历",
|
||||
"能力",
|
||||
"工作经历",
|
||||
"项目经历",
|
||||
"技术栈",
|
||||
];
|
||||
|
||||
const deepseekClient = process.env.DEEPSEEK_API_KEY
|
||||
? new OpenAI({
|
||||
baseURL: "https://api.deepseek.com",
|
||||
apiKey: process.env.DEEPSEEK_API_KEY,
|
||||
})
|
||||
: null;
|
||||
|
||||
/**
 * Rule-based resume parser: for every label in `resumeFields`, captures the
 * text after "<label>:" up to the next "<other label>:" or end of input.
 *
 * @param {*} text - raw resume text (falsy values are treated as "").
 * @returns {object} one key per resume field; "" when the label is absent.
 */
function parseResumeText(text) {
  const escapeForRegex = (label) => label.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

  const source = String(text || "")
    .replace(/\r\n/g, "\n")
    .replace(/\u3000/g, " ")
    .trim();

  const result = {};
  for (const field of resumeFields) {
    result[field] = "";
  }

  resumeFields.forEach((field) => {
    const followingLabels = resumeFields
      .filter((other) => other !== field)
      .map((other) => escapeForRegex(other))
      .join("|");
    const matcher = new RegExp(
      `${escapeForRegex(field)}\\s*[::]\\s*([\\s\\S]*?)(?=\\s*(?:${followingLabels})\\s*[::]|$)`,
      "i"
    );
    const hit = source.match(matcher);
    if (hit) {
      // Collapse internal whitespace so multi-line values become one line.
      result[field] = hit[1].replace(/\s+/g, " ").trim();
    }
  });

  return result;
}
|
||||
|
||||
/**
 * Coerces an AI-produced "parsed" object into the fixed resume shape.
 *
 * @param {*} data - candidate object; anything that is not a plain
 *   non-array object yields all-empty fields.
 * @returns {object} one trimmed string per entry of `resumeFields`.
 */
function normalizeParsedResume(data) {
  const usable = Boolean(data) && typeof data === "object" && !Array.isArray(data);

  const entries = resumeFields.map((field) => {
    if (!usable) {
      return [field, ""];
    }
    const raw = data[field];
    if (raw == null) {
      return [field, ""];
    }
    const asText = typeof raw === "string" ? raw : String(raw);
    return [field, asText.trim()];
  });

  return Object.fromEntries(entries);
}
|
||||
|
||||
/**
 * Turns an array or a delimiter-separated string into a list of non-empty,
 * trimmed strings.
 *
 * @param {*} value - array (items are stringified, null/undefined dropped),
 *   string (split on comma, ideographic comma or semicolon), or anything else.
 * @returns {string[]} cleaned list; [] for unsupported input.
 */
function normalizeStringArray(value) {
  let pieces;
  if (Array.isArray(value)) {
    pieces = value.map((entry) => (entry == null ? "" : String(entry)));
  } else if (typeof value === "string") {
    pieces = value.split(/[,,、;;]/);
  } else {
    return [];
  }
  return pieces.map((piece) => piece.trim()).filter(Boolean);
}
|
||||
|
||||
/**
 * Keeps only well-formed question/answer pairs.
 *
 * @param {*} value - expected to be an array of `{ question, standardAnswer }`.
 * @returns {Array<{question: string, standardAnswer: string}>} trimmed pairs
 *   where both parts are non-empty strings; [] when `value` is not an array.
 */
function normalizeInterviewQuestions(value) {
  if (!Array.isArray(value)) {
    return [];
  }

  const trimmedText = (candidate) =>
    typeof candidate === "string" ? candidate.trim() : "";

  const cleaned = [];
  for (const entry of value) {
    const question = trimmedText(entry?.question);
    const standardAnswer = trimmedText(entry?.standardAnswer);
    if (question && standardAnswer) {
      cleaned.push({ question, standardAnswer });
    }
  }
  return cleaned;
}
|
||||
|
||||
/**
 * Builds the generic fallback interview question list used when no
 * AI-generated questions are available.
 *
 * @returns {Array<{question: string, standardAnswer: string}>} ten entries,
 *   all sharing the same generic answer guideline; a fresh array per call.
 */
function getDefaultInterviewQuestions() {
  const sharedAnswer =
    "答案应包含明确场景、技术方案、实施步骤、结果数据和复盘总结,体现候选人的技术深度与工程思维。";

  const questionTexts = [
    "请简要介绍你最熟悉的一个项目,并说明你负责的核心模块。",
    "你在该项目中解决过最复杂的技术问题是什么?如何定位与解决?",
    "如果线上出现性能瓶颈,你会如何做排查和优化?",
    "你如何保证代码质量和可维护性?",
    "请解释你常用技术栈中的一个核心原理。",
    "在多人协作开发中,你如何进行任务拆分与沟通?",
    "你如何设计一个可扩展的后端接口?",
    "你做过哪些稳定性保障措施(监控、告警、容错等)?",
    "遇到需求变更时,你如何评估影响并快速调整?",
    "请描述一次你主导或关键参与的优化成果(可量化最好)。",
  ];

  const result = [];
  for (const question of questionTexts) {
    result.push({ question, standardAnswer: sharedAnswer });
  }
  return result;
}
|
||||
|
||||
/**
 * Coerces an AI-produced interview report into the fixed report shape.
 *
 * @param {*} data - candidate report; anything that is not a plain
 *   non-array object yields an empty report carrying the default questions.
 * @returns {object} `{ interviewQuestions, abilitySummary, rating: { level,
 *   score, reason }, techStack, strengths, weaknesses }`. When fewer than ten
 *   valid questions are supplied, the list is padded up to ten with the
 *   default questions at the missing positions.
 */
function normalizeInterviewReport(data) {
  const usable = Boolean(data) && typeof data === "object" && !Array.isArray(data);
  if (!usable) {
    return {
      interviewQuestions: getDefaultInterviewQuestions(),
      abilitySummary: "",
      rating: { level: "", score: "", reason: "" },
      techStack: [],
      strengths: [],
      weaknesses: [],
    };
  }

  const trimmedText = (candidate) =>
    typeof candidate === "string" ? candidate.trim() : "";

  let interviewQuestions = normalizeInterviewQuestions(data.interviewQuestions);
  if (interviewQuestions.length < 10) {
    const padding = getDefaultInterviewQuestions().slice(interviewQuestions.length, 10);
    interviewQuestions = interviewQuestions.concat(padding);
  }

  const rawScore = data?.rating?.score;
  const rating = {
    level: trimmedText(data?.rating?.level),
    score: rawScore == null ? "" : String(rawScore).trim(),
    reason: trimmedText(data?.rating?.reason),
  };

  return {
    interviewQuestions,
    abilitySummary: trimmedText(data.abilitySummary),
    rating,
    techStack: normalizeStringArray(data.techStack),
    strengths: normalizeStringArray(data.strengths),
    weaknesses: normalizeStringArray(data.weaknesses),
  };
}
|
||||
|
||||
/**
 * Pulls a JSON value out of a model reply.
 *
 * Tries a ```json fenced block first; otherwise parses the span from the
 * first "{" to the last "}".
 *
 * @param {string} content - raw model output.
 * @returns {*} the parsed JSON value.
 * @throws {Error} when content is empty or contains no brace-delimited span.
 * @throws {SyntaxError} when the selected span is not valid JSON.
 */
function extractJsonObject(content) {
  if (!content) {
    throw new Error("AI 未返回内容");
  }

  const fencePattern = /```json\s*([\s\S]*?)\s*```/i;
  const fenced = content.match(fencePattern);
  if (fenced) {
    return JSON.parse(fenced[1]);
  }

  const firstBrace = content.indexOf("{");
  const lastBrace = content.lastIndexOf("}");
  const hasBraceSpan = firstBrace !== -1 && lastBrace !== -1 && lastBrace > firstBrace;
  if (!hasBraceSpan) {
    throw new Error("AI 返回内容不是合法 JSON");
  }

  return JSON.parse(content.slice(firstBrace, lastBrace + 1));
}
|
||||
|
||||
/**
 * Structures a resume and produces an interview report, preferring the
 * DeepSeek chat model and falling back to rule-based parsing.
 *
 * @param {string} text - extracted resume text.
 * @returns {Promise<object>} `{ parsed, interviewReport, parser, parserNote }`
 *   where `parser` is "ai" or "rule". Errors from the model call or from
 *   decoding its reply are converted into the rule-based fallback.
 */
async function parseResumeWithAI(text) {
  // Shared fallback: regex parsing plus the default question template.
  const ruleBasedResult = (parserNote) => ({
    parsed: parseResumeText(text),
    interviewReport: normalizeInterviewReport(null),
    parser: "rule",
    parserNote,
  });

  if (!deepseekClient) {
    return ruleBasedResult("未配置 DEEPSEEK_API_KEY,已使用规则解析,面试题使用默认模板");
  }

  const promptLines = [
    "请从下面的简历原文中完成结构化提取和面试评估。",
    "你只能返回一个合法 JSON,不要输出任何解释文字。",
    "JSON 顶层必须包含两个键:parsed 和 interviewReport。",
    "parsed 字段键名必须严格为:姓名、城市、年龄、期望薪资、教育经历、学历、能力、工作经历、技术栈、项目经历。",
    "parsed 中字段缺失时填空字符串。",
    "interviewReport 的结构必须为:",
    "{",
    ' "interviewQuestions": [{"question":"问题1","standardAnswer":"标准答案1"}],',
    ' "abilitySummary": "能力总结",',
    ' "rating": {"level":"S/A/B/C","score":"0-100","reason":"评级原因"},',
    ' "techStack": ["技术1","技术2"],',
    ' "strengths": ["优点1","优点2"],',
    ' "weaknesses": ["缺点1","缺点2"]',
    "}",
    "interviewQuestions 至少生成 10 条,并且每条都要贴合该候选人简历内容。",
    "简历原文:",
    text,
  ];
  const prompt = promptLines.join("\n");

  const systemMessage = {
    role: "system",
    content:
      "你是资深技术面试官与简历评估助手。你只能返回合法 JSON,禁止输出 markdown 代码块和解释文本。",
  };
  const userMessage = { role: "user", content: prompt };

  try {
    const completion = await deepseekClient.chat.completions.create({
      model: "deepseek-chat",
      temperature: 0.1,
      messages: [systemMessage, userMessage],
    });

    const reply = completion.choices?.[0]?.message?.content || "";
    const decoded = extractJsonObject(reply);

    return {
      parsed: normalizeParsedResume(decoded?.parsed),
      interviewReport: normalizeInterviewReport(decoded?.interviewReport),
      parser: "ai",
      parserNote: "已使用 DeepSeek 进行结构化解析与面试评估",
    };
  } catch (error) {
    return ruleBasedResult(
      `AI 解析失败,已回退规则解析,面试题使用默认模板:${error.message}`
    );
  }
}
|
||||
|
||||
// Cached dynamic import of the pdf.js ES module (loaded at most once).
let pdfjsModulePromise;

/**
 * Loads the pdfjs-dist legacy build and points its worker option at the
 * bundled worker file.
 *
 * @returns {Promise<object>} the pdf.js module namespace.
 */
async function getPdfJsModule() {
  const fileHrefOf = (specifier) => pathToFileURL(require.resolve(specifier)).href;

  if (!pdfjsModulePromise) {
    pdfjsModulePromise = import(fileHrefOf("pdfjs-dist/legacy/build/pdf.mjs"));
  }

  const pdfjsModule = await pdfjsModulePromise;
  // Re-applied on every call, as in the original implementation.
  pdfjsModule.GlobalWorkerOptions.workerSrc = fileHrefOf(
    "pdfjs-dist/legacy/build/pdf.worker.mjs"
  );
  return pdfjsModule;
}
|
||||
|
||||
/**
 * Races a promise against a deadline.
 *
 * The deadline timer is cleared as soon as the race settles, so a promise
 * that finishes early does not leave a pending timer behind.
 *
 * @param {Promise<*>} promise - the operation to wait for.
 * @param {number} ms - deadline in milliseconds.
 * @param {string} label - operation name used in the timeout error message.
 * @returns {Promise<*>} settles like `promise`, or rejects with an Error
 *   when the deadline passes first. The underlying operation is not cancelled.
 */
function withTimeout(promise, ms, label) {
  let timer;
  const deadline = new Promise((_, reject) => {
    timer = setTimeout(() => {
      reject(new Error(`${label} 超时(${ms}ms)`));
    }, ms);
  });

  return Promise.race([promise, deadline]).finally(() => {
    clearTimeout(timer);
  });
}
|
||||
|
||||
// Cached worker promise and the language it was created for.
let tesseractWorkerPromise;
let tesseractWorkerLang = "";

/**
 * Returns the shared Tesseract worker for the language in PDF_OCR_LANG
 * (default "chi_sim"), creating it on first use and recreating it when the
 * configured language changes.
 *
 * @returns {Promise<object>} the ready worker.
 * @throws when worker creation fails or exceeds PDF_OCR_INIT_TIMEOUT_MS
 *   (default 120000); the cache is reset so a later call retries.
 */
async function getTesseractWorker() {
  const lang = process.env.PDF_OCR_LANG || "chi_sim";

  const languageChanged = Boolean(tesseractWorkerPromise) && tesseractWorkerLang !== lang;
  if (languageChanged) {
    // Best-effort shutdown of the worker built for the previous language.
    const staleWorker = await tesseractWorkerPromise.catch(() => null);
    if (staleWorker && typeof staleWorker.terminate === "function") {
      await staleWorker.terminate().catch(() => {});
    }
    tesseractWorkerPromise = null;
  }

  if (!tesseractWorkerPromise) {
    tesseractWorkerLang = lang;
    const initTimeoutMs = Number(process.env.PDF_OCR_INIT_TIMEOUT_MS || 120000);
    const initLabel = "Tesseract 初始化(首次需下载模型,请保持网络畅通)";
    tesseractWorkerPromise = withTimeout(createWorker(lang), initTimeoutMs, initLabel);
  }

  try {
    const worker = await tesseractWorkerPromise;
    return worker;
  } catch (error) {
    // Forget the failed attempt so the next caller starts from scratch.
    tesseractWorkerPromise = null;
    tesseractWorkerLang = "";
    throw error;
  }
}
|
||||
|
||||
// Tail of the OCR job chain; it never rejects, so one failure cannot block later jobs.
let ocrQueue = Promise.resolve();

/**
 * Runs OCR tasks strictly one after another.
 *
 * @param {Function} task - zero-argument function returning a value or promise.
 * @returns {Promise<*>} settles with the task's own result or error.
 */
function enqueueOcr(task) {
  const runTask = () => task();
  const pending = ocrQueue.then(runTask, runTask);
  ocrQueue = pending.catch(() => {});
  return pending;
}
|
||||
|
||||
/**
 * Counts characters that are neither whitespace nor zero-width marks
 * (U+200B to U+200D, U+FEFF).
 *
 * @param {*} text - text to measure; falsy values count as empty.
 * @returns {number} number of remaining UTF-16 code units.
 */
function pdfMeaningfulCharCount(text) {
  const raw = String(text || "");
  const visible = raw.replace(/[\s\u200b-\u200d\ufeff]/g, "");
  return visible.trim().length;
}
|
||||
|
||||
/**
 * Decides whether a PDF's text layer is short enough to warrant OCR.
 *
 * OCR runs only when PDF_OCR_ENABLED is "1" and PDF_OCR_DISABLED is not "1",
 * and the meaningful character count is below PDF_OCR_MIN_CHARS (default 80).
 *
 * @param {string} text - text extracted from the PDF text layer.
 * @returns {boolean} true when OCR should be attempted.
 */
function shouldRunPdfOcr(text) {
  const { PDF_OCR_DISABLED, PDF_OCR_ENABLED, PDF_OCR_MIN_CHARS } = process.env;

  const ocrAllowed = PDF_OCR_DISABLED !== "1" && PDF_OCR_ENABLED === "1";
  if (!ocrAllowed) {
    return false;
  }

  const threshold = Number(PDF_OCR_MIN_CHARS || 80);
  return pdfMeaningfulCharCount(text) < threshold;
}
|
||||
|
||||
/**
 * Renders the first pages of a PDF to PNG and runs Tesseract OCR on them.
 *
 * Environment knobs: PDF_OCR_MAX_PAGES (default 5), PDF_OCR_SCALE
 * (default 2), PDF_OCR_PAGE_TIMEOUT_MS (default 120000). Non-numeric values
 * fall back to the default instead of producing NaN.
 *
 * Everything that can fail after the loading task is created — including
 * Tesseract worker initialisation — runs inside one try/finally, so the
 * pdf.js loading task is always destroyed, and each page is cleaned up even
 * when rendering or OCR throws.
 *
 * @param {Buffer|Uint8Array} pdfBuffer - raw PDF bytes.
 * @returns {Promise<{text: string, pagesProcessed: number}>} OCR text (pages
 *   joined by a blank line) and the number of pages attempted.
 * @throws when the PDF cannot be loaded, the worker cannot start, or
 *   rendering/OCR of a page fails or times out.
 */
async function extractPdfTextViaOcr(pdfBuffer) {
  const { createCanvas } = require("@napi-rs/canvas");

  const envNumber = (raw, fallback) => {
    const parsed = Number(raw || fallback);
    return Number.isFinite(parsed) ? parsed : fallback;
  };

  const pdfjs = await getPdfJsModule();
  const loadingTask = pdfjs.getDocument({
    data: new Uint8Array(pdfBuffer),
    useSystemFonts: true,
  });

  const chunks = [];
  let maxPages = 0;

  try {
    const doc = await loadingTask.promise;

    maxPages = Math.min(doc.numPages, envNumber(process.env.PDF_OCR_MAX_PAGES, 5));
    const scale = envNumber(process.env.PDF_OCR_SCALE, 2);
    const ocrTimeout = envNumber(process.env.PDF_OCR_PAGE_TIMEOUT_MS, 120000);

    // Acquired inside the try so an init failure still destroys the loading task.
    const worker = await getTesseractWorker();

    for (let pageNum = 1; pageNum <= maxPages; pageNum += 1) {
      const page = await doc.getPage(pageNum);
      try {
        const viewport = page.getViewport({ scale });
        const width = Math.max(1, Math.ceil(viewport.width));
        const height = Math.max(1, Math.ceil(viewport.height));

        const canvas = createCanvas(width, height);
        const ctx = canvas.getContext("2d");
        // White background: PDF pages are transparent by default.
        ctx.fillStyle = "#ffffff";
        ctx.fillRect(0, 0, width, height);

        await page.render({ canvasContext: ctx, viewport }).promise;
        const pngBuffer = canvas.toBuffer("image/png");

        const { data } = await withTimeout(
          worker.recognize(pngBuffer),
          ocrTimeout,
          `第 ${pageNum} 页 OCR`
        );
        const pageText = String(data.text || "").trim();
        if (pageText) {
          chunks.push(pageText);
        }
      } finally {
        page.cleanup();
      }
    }
  } finally {
    await loadingTask.destroy().catch(() => {});
  }

  return {
    text: chunks.join("\n\n"),
    pagesProcessed: maxPages,
  };
}
|
||||
|
||||
/**
 * Extracts plain text from an uploaded .docx or .pdf file.
 *
 * For PDFs the text layer is read first; OCR is attempted only when
 * shouldRunPdfOcr allows it, and its output is used only when it contains
 * more meaningful characters than the text layer. OCR failures are recorded
 * as warnings instead of being thrown.
 *
 * @param {object} file - multer file (`originalname`, `buffer`).
 * @returns {Promise<object>} `{ text, warnings, sourceType }` plus
 *   `pdfExtraction` diagnostics for PDFs.
 * @throws {Error} for any other file extension.
 */
async function extractTextFromUpload(file) {
  const ext = path.extname(file.originalname || "").toLowerCase();

  switch (ext) {
    case ".docx": {
      const docx = await mammoth.extractRawText({ buffer: file.buffer });
      return {
        text: docx.value || "",
        warnings: docx.messages || [],
        sourceType: "docx",
      };
    }

    case ".pdf": {
      const pdfParser = new PDFParse({ data: file.buffer });
      let textLayer;
      try {
        textLayer = await pdfParser.getText();
      } finally {
        await pdfParser.destroy();
      }

      const primaryText = textLayer.text || "";
      const minChars = Number(process.env.PDF_OCR_MIN_CHARS || 80);
      const textLayerChars = pdfMeaningfulCharCount(primaryText);
      const ocrSwitchedOn = process.env.PDF_OCR_ENABLED === "1";

      const pdfExtraction = {
        textLayerChars,
        ocrLang: process.env.PDF_OCR_LANG || "chi_sim",
        ocrEnabled: ocrSwitchedOn,
        ocrAttempted: false,
        ocrPages: 0,
        ocrUsed: false,
        ocrError: "",
        ocrSkippedReason: "",
      };
      const warnings = [...(textLayer.messages || [])];
      let finalText = primaryText;

      // Explain why a probably-scanned PDF was not OCR'd.
      if (textLayerChars < minChars && !ocrSwitchedOn) {
        pdfExtraction.ocrSkippedReason =
          "文本层较短,疑似扫描件;未设置 PDF_OCR_ENABLED=1,已跳过 OCR(可在 .env 开启并保证网络可下载 Tesseract 模型)";
      }

      if (shouldRunPdfOcr(primaryText)) {
        pdfExtraction.ocrAttempted = true;
        try {
          const ocrResult = await enqueueOcr(() => extractPdfTextViaOcr(file.buffer));
          pdfExtraction.ocrPages = ocrResult.pagesProcessed;
          const ocrIsRicher =
            pdfMeaningfulCharCount(ocrResult.text) > pdfMeaningfulCharCount(finalText);
          if (ocrIsRicher) {
            finalText = ocrResult.text;
            pdfExtraction.ocrUsed = true;
          }
        } catch (error) {
          pdfExtraction.ocrError = error.message;
          warnings.push(`PDF OCR 失败:${error.message}`);
        }
      }

      return {
        text: finalText,
        warnings,
        sourceType: "pdf",
        pdfExtraction,
      };
    }

    default:
      throw new Error(`不支持的文件类型:${ext || "未知"}。当前仅支持 .docx 和 .pdf`);
  }
}
|
||||
|
||||
// Uploads are kept in memory so the buffer can be handed straight to the parsers.
// Files are limited to 20 MB and to the .docx / .pdf extensions.
const upload = multer({
  storage: multer.memoryStorage(),
  limits: { fileSize: 20 * 1024 * 1024 },
  fileFilter(req, file, cb) {
    const ext = path.extname(file.originalname || "").toLowerCase();
    const accepted = ext === ".docx" || ext === ".pdf";
    if (accepted) {
      cb(null, true);
      return undefined;
    }
    const rejection = new Error(
      `不支持的文件类型:${ext || "未知"}。当前仅支持 .docx 和 .pdf`
    );
    return cb(rejection, false);
  },
});
|
||||
|
||||
// Liveness endpoint: always reports ok for the HTTP service and embeds the MySQL probe result.
app.get("/health", async (req, res) => {
  const mysqlStatus = await getMysqlHealth();
  res.json({ ok: true, mysql: mysqlStatus });
});

// Serves the single-page front end.
app.get("/", (req, res) => {
  const indexFile = path.join(__dirname, "..", "public", "index.html");
  res.sendFile(indexFile);
});
|
||||
|
||||
/**
 * Normalises a JSON column value that may arrive already decoded (object)
 * or as a JSON string.
 *
 * @param {*} value - raw column value.
 * @param {*} fallback - returned for null/undefined, unparsable strings and
 *   any other type.
 * @returns {*} the decoded value or `fallback`.
 */
function normalizeJsonColumn(value, fallback) {
  switch (typeof value) {
    case "object":
      // typeof null is "object"; null falls back.
      return value === null ? fallback : value;
    case "string":
      try {
        return JSON.parse(value);
      } catch {
        return fallback;
      }
    default:
      return fallback;
  }
}
|
||||
|
||||
/** 未配置 MySQL 时,后台 AI 完成后暂存于此,供轮询与 consult-ai 读取 */
|
||||
const memoryAiByHash = new Map();
|
||||
|
||||
/**
 * Background job: runs the AI analysis for an upload session and stores the
 * result in MySQL, or in the in-memory map when MySQL is not configured or
 * the insert fails. The session is deleted once the result has been stored.
 *
 * @param {string} uploadId - session id created by the step-1 upload.
 * @returns {Promise<void>} resolves after the attempt; failures are logged.
 */
async function runResumeAiPersistInBackground(uploadId) {
  const session = getSession(uploadId);
  if (!session) {
    return;
  }

  let persisted = false;
  try {
    const aiResult = await parseResumeWithAI(session.extractedText);
    const fields = aiResult.parsed || {};

    // Keeps the result available to polling clients without a database row.
    const rememberInMemory = () => {
      memoryAiByHash.set(session.fileSha256, {
        parsed: aiResult.parsed,
        interviewReport: aiResult.interviewReport,
        parser: aiResult.parser,
        parserNote: aiResult.parserNote,
        warnings: session.warnings,
      });
    };

    if (!getMysqlPool()) {
      rememberInMemory();
    } else {
      try {
        await insertResumeSubmission({
          originalFilename: session.originalFilename,
          mimeType: session.mimeType,
          sourceType: session.sourceType,
          fileSha256: session.fileSha256,
          name: fields.姓名 || null,
          city: fields.城市 || null,
          age: fields.年龄 || null,
          expectedSalary: fields.期望薪资 || null,
          educationExperience: fields.教育经历 || null,
          education: fields.学历 || null,
          ability: fields.能力 || null,
          workExperience: fields.工作经历 || null,
          projectExperience: fields.项目经历 || null,
          techStack: fields.技术栈 || null,
          interviewReport: aiResult.interviewReport,
          pdfMeta: session.pdfExtraction || null,
          parser: aiResult.parser,
          parserNote: aiResult.parserNote,
          warningsJson: session.warnings,
        });
      } catch (err) {
        console.error("[resume-ai-bg] 写入数据库失败,已写入内存供轮询:", err?.message || err);
        rememberInMemory();
      }
    }
    persisted = true;
  } catch (err) {
    console.error("[resume-ai-bg] 生成失败:", err?.message || err);
  } finally {
    if (persisted) {
      deleteSession(uploadId);
    }
  }
}
|
||||
|
||||
// Step 1: upload + text extraction + duplicate lookup. The AI call is not made
// here; for new files it is scheduled in the background and the client polls.
app.post("/api/resume/step1", upload.single("file"), async (req, res, next) => {
  try {
    const uploaded = req.file;
    if (!uploaded) {
      return res.status(400).json({ error: "缺少上传文件:请使用 form-data 字段 file" });
    }

    const extracted = await extractTextFromUpload(uploaded);
    const hasher = crypto.createHash("sha256");
    hasher.update(uploaded.buffer);
    const fileSha256 = hasher.digest("hex");
    const parsedPreview = parseResumeText(extracted.text);

    const duplicateRow = getMysqlPool()
      ? await findDuplicateResumeByFileHash(fileSha256)
      : null;

    if (duplicateRow) {
      // Same file already analysed: answer straight from the stored row.
      const storedParsed = dbRowToParsed(duplicateRow);
      const storedReport = normalizeJsonColumn(duplicateRow.interview_report, null);
      const storedWarnings = normalizeJsonColumn(duplicateRow.warnings_json, []);
      return res.json({
        step: 1,
        fileSha256,
        uploadId: null,
        isDuplicate: true,
        duplicateOfId: duplicateRow.id,
        aiReady: true,
        fromCache: true,
        text: extracted.text,
        pdfExtraction: extracted.pdfExtraction,
        parsedPreview: storedParsed,
        interviewReport: storedReport,
        parser: duplicateRow.parser,
        parserNote: duplicateRow.parser_note,
        warnings: Array.isArray(storedWarnings) ? storedWarnings : [],
        hint: "该文件与库中记录重复,已直接返回库内 AI 结果,无需再次调用模型。",
      });
    }

    const uploadId = crypto.randomUUID();
    saveSession(uploadId, {
      extractedText: extracted.text,
      fileSha256,
      originalFilename: uploaded.originalname,
      mimeType: uploaded.mimetype,
      sourceType: extracted.sourceType,
      pdfExtraction: extracted.pdfExtraction,
      warnings: extracted.warnings,
    });

    // Fire-and-forget: the response below is sent without waiting for the AI.
    setImmediate(() => {
      runResumeAiPersistInBackground(uploadId).catch((err) => {
        console.error("[resume-ai-bg] 未捕获:", err?.message || err);
      });
    });

    return res.json({
      step: 1,
      fileSha256,
      uploadId,
      isDuplicate: false,
      duplicateOfId: null,
      aiReady: false,
      fromCache: false,
      text: extracted.text,
      pdfExtraction: extracted.pdfExtraction,
      parsedPreview,
      interviewReport: null,
      parser: null,
      parserNote: null,
      warnings: extracted.warnings,
      hint: "AI 正在后台生成并写入数据库,就绪后「咨询 AI」将可点击;也可通过页面轮询获知。",
    });
  } catch (err) {
    next(err);
  }
});
|
||||
|
||||
/** Polling endpoint: is an AI result for this file hash available in MySQL or in memory? */
app.get("/api/resume/ai-ready", async (req, res, next) => {
  try {
    const { fileSha256 } = req.query;
    const hashIsValid = Boolean(fileSha256) && typeof fileSha256 === "string";
    if (!hashIsValid) {
      return res.status(400).json({ error: "缺少 fileSha256" });
    }

    const storedRow = getMysqlPool()
      ? await findDuplicateResumeByFileHash(fileSha256)
      : null;
    if (storedRow) {
      return res.json({
        ready: true,
        duplicateOfId: storedRow.id,
        parserNote: storedRow.parser_note || null,
      });
    }

    const reply = memoryAiByHash.has(fileSha256)
      ? { ready: true, fromMemory: true }
      : { ready: false };
    return res.json(reply);
  } catch (err) {
    next(err);
  }
});
|
||||
|
||||
// Step 2: return the stored AI result for a file hash. The model is never
// called here; the result comes from MySQL or from the in-memory cache.
//
// The memory cache is consulted even when MySQL is configured: the background
// job falls back to memory when its insert fails, and /api/resume/ai-ready
// reports such results as ready, so they must be retrievable here too.
app.post("/api/resume/consult-ai", async (req, res, next) => {
  try {
    const { fileSha256 } = req.body || {};
    if (!fileSha256 || typeof fileSha256 !== "string") {
      return res.status(400).json({ error: "缺少 fileSha256" });
    }

    const dbConfigured = Boolean(getMysqlPool());

    if (dbConfigured) {
      const dup = await findDuplicateResumeByFileHash(fileSha256);
      if (dup) {
        const parsed = dbRowToParsed(dup);
        const interviewReport = normalizeJsonColumn(dup.interview_report, null);
        const warnings = normalizeJsonColumn(dup.warnings_json, []);
        return res.json({
          fromCache: true,
          isDuplicate: true,
          duplicateOfId: dup.id,
          parsed,
          interviewReport,
          parser: dup.parser,
          parserNote: `${dup.parser_note || ""}(库内读取)`.trim(),
          warnings: Array.isArray(warnings) ? warnings : [],
          db: { saved: false, skipped: true, reason: "read_db" },
        });
      }
    }

    const mem = memoryAiByHash.get(fileSha256);
    if (mem) {
      return res.json({
        fromCache: true,
        isDuplicate: false,
        duplicateOfId: null,
        parsed: mem.parsed,
        interviewReport: mem.interviewReport,
        parser: mem.parser,
        parserNote: `${mem.parserNote || ""}(内存缓存)`.trim(),
        warnings: Array.isArray(mem.warnings) ? mem.warnings : [],
        db: { saved: false, skipped: true, reason: "memory" },
      });
    }

    // Not ready yet: keep the original wording for each deployment mode.
    return res.status(409).json({
      error: dbConfigured
        ? "库中尚未找到该文件的 AI 评估,请等待后台写入完成后再试(页面会自动轮询)。"
        : "尚未找到 AI 评估结果,请等待后台生成完成后再试(未配置数据库时使用内存缓存)。",
    });
  } catch (err) {
    next(err);
  }
});
|
||||
|
||||
// Accepts multipart/form-data with the file in field "file".
// Single-shot endpoint: extracts the text, runs the AI analysis synchronously,
// attempts to store the result and returns everything in one response.
app.post("/api/parse-word", upload.single("file"), async (req, res, next) => {
  try {
    const uploaded = req.file;
    if (!uploaded) {
      return res.status(400).json({ error: "缺少上传文件:请使用 form-data 字段 file" });
    }

    const extracted = await extractTextFromUpload(uploaded);
    const aiResult = await parseResumeWithAI(extracted.text);

    const hasher = crypto.createHash("sha256");
    hasher.update(uploaded.buffer);
    const fileSha256 = hasher.digest("hex");

    // Persistence status reported to the client; a failed insert does not fail the request.
    let db;
    if (!getMysqlPool()) {
      db = { saved: false, skipped: true };
    } else {
      try {
        const fields = aiResult.parsed || {};
        const insertId = await insertResumeSubmission({
          originalFilename: uploaded.originalname,
          mimeType: uploaded.mimetype,
          sourceType: extracted.sourceType,
          fileSha256,
          name: fields.姓名 || null,
          city: fields.城市 || null,
          age: fields.年龄 || null,
          expectedSalary: fields.期望薪资 || null,
          educationExperience: fields.教育经历 || null,
          education: fields.学历 || null,
          ability: fields.能力 || null,
          workExperience: fields.工作经历 || null,
          projectExperience: fields.项目经历 || null,
          techStack: fields.技术栈 || null,
          interviewReport: aiResult.interviewReport,
          pdfMeta: extracted.pdfExtraction || null,
          parser: aiResult.parser,
          parserNote: aiResult.parserNote,
          warningsJson: extracted.warnings,
        });
        db = { saved: true, id: insertId };
      } catch (err) {
        console.error("[mysql] 写入 resume_submissions 失败:", err?.message || err);
        db = {
          saved: false,
          skipped: false,
          error: err?.message || String(err),
        };
      }
    }

    return res.json({
      filename: uploaded.originalname,
      mimeType: uploaded.mimetype,
      sourceType: extracted.sourceType,
      text: extracted.text,
      pdfExtraction: extracted.pdfExtraction,
      parsed: aiResult.parsed,
      interviewReport: aiResult.interviewReport,
      parser: aiResult.parser,
      parserNote: aiResult.parserNote,
      warnings: extracted.warnings,
      db,
    });
  } catch (err) {
    next(err);
  }
});
|
||||
|
||||
app.use(express.static(path.join(__dirname, "..", "public")));
|
||||
|
||||
// Central error handler.
//
// Status mapping:
//   - "unsupported file type" errors raised by the upload filter -> 415
//   - multer LIMIT_FILE_SIZE                                     -> 413
//   - any other MulterError (unexpected field, too many parts…)  -> 400
//   - errors carrying a 4xx/5xx `status`/`statusCode`
//     (e.g. malformed or oversized JSON bodies)                  -> that status
//   - everything else                                            -> 500
// If the response has already started, the error is delegated to Express'
// default handler, which closes the connection.
app.use((err, req, res, next) => {
  if (res.headersSent) {
    return next(err);
  }

  const message = err?.message || "服务端错误";
  const declaredStatus = Number(err?.status ?? err?.statusCode);

  let status = 500;
  if (err?.message && /不支持的文件类型/.test(err.message)) {
    status = 415;
  } else if (err?.code === "LIMIT_FILE_SIZE") {
    status = 413;
  } else if (err?.name === "MulterError") {
    status = 400;
  } else if (Number.isInteger(declaredStatus) && declaredStatus >= 400 && declaredStatus < 600) {
    status = declaredStatus;
  }

  return res.status(status).json({ error: message });
});
|
||||
|
||||
const PORT = Number(process.env.PORT || 3000);

// Start listening, then probe MySQL and prepare the schema. Database problems
// are logged only; the HTTP server keeps running without persistence.
app.listen(PORT, async () => {
  console.log(`Word parse API running on http://localhost:${PORT}`);

  try {
    const mysqlResult = await testMysqlConnection();

    if (mysqlResult.skipped) {
      console.log("[mysql] 未配置 MYSQL_HOST,跳过连接");
      return;
    }
    if (!mysqlResult.ok) {
      return;
    }

    console.log(`[mysql] 已连接,当前库: ${mysqlResult.database || "未知"}`);
    try {
      await ensureResumeTables();
      console.log("[mysql] 数据表 resume_submissions 已就绪");
    } catch (tableErr) {
      console.error("[mysql] 建表失败:", tableErr?.message || tableErr);
    }
  } catch (err) {
    console.error("[mysql] 连接失败:", err?.message || err);
  }
});
|
||||
|
||||
42
src/upload-session-cache.js
Normal file
42
src/upload-session-cache.js
Normal file
@@ -0,0 +1,42 @@
|
||||
// Session lifetime. Invalid or non-positive UPLOAD_SESSION_TTL_MS values fall
// back to one hour, and the value is capped at the largest delay setTimeout
// supports (2^31 - 1 ms); beyond that Node fires the timer almost immediately.
const DEFAULT_TTL_MS = 60 * 60 * 1000;
const MAX_TIMER_DELAY_MS = 2 ** 31 - 1;
const configuredTtlMs = Number(process.env.UPLOAD_SESSION_TTL_MS || DEFAULT_TTL_MS);
const TTL_MS =
  Number.isFinite(configuredTtlMs) && configuredTtlMs > 0
    ? Math.min(configuredTtlMs, MAX_TIMER_DELAY_MS)
    : DEFAULT_TTL_MS;

// uploadId -> session record ({ ...payload, expiresAt }).
const store = new Map();
// uploadId -> pending expiry timer, kept apart so records stay plain data.
const expiryTimers = new Map();

/** Cancels and forgets the expiry timer of a session, if any. */
function cancelExpiryTimer(uploadId) {
  const timer = expiryTimers.get(uploadId);
  if (timer !== undefined) {
    clearTimeout(timer);
    expiryTimers.delete(uploadId);
  }
}

/**
 * Stores (or replaces) an upload session that expires after TTL_MS.
 *
 * @param {string} uploadId - session key.
 * @param {object} payload - session data; copied shallowly into the record.
 * @returns {void}
 */
function saveSession(uploadId, payload) {
  cancelExpiryTimer(uploadId);

  const record = {
    ...payload,
    expiresAt: Date.now() + TTL_MS,
  };
  store.set(uploadId, record);

  const timer = setTimeout(() => {
    expiryTimers.delete(uploadId);
    // Only evict the record this timer was created for.
    if (store.get(uploadId) === record) {
      store.delete(uploadId);
    }
  }, TTL_MS);
  // Housekeeping must not keep the process alive on shutdown.
  timer.unref?.();
  expiryTimers.set(uploadId, timer);
}

/**
 * Fetches a live session.
 *
 * @param {string} uploadId - session key.
 * @returns {object|null} the stored record, or null when the id is empty,
 *   unknown or expired (expired records are removed).
 */
function getSession(uploadId) {
  if (!uploadId) {
    return null;
  }
  const record = store.get(uploadId);
  if (!record) {
    return null;
  }
  if (Date.now() > record.expiresAt) {
    deleteSession(uploadId);
    return null;
  }
  return record;
}

/**
 * Removes a session and cancels its expiry timer.
 *
 * @param {string} uploadId - session key.
 * @returns {void}
 */
function deleteSession(uploadId) {
  cancelExpiryTimer(uploadId);
  store.delete(uploadId);
}
|
||||
|
||||
module.exports = {
|
||||
saveSession,
|
||||
getSession,
|
||||
deleteSession,
|
||||
};
|
||||
Reference in New Issue
Block a user