I'm using the pdf.js library to extract text from PDF files, but the extracted text isn't formatted correctly: some lines end up at the end of the output instead of where they belong. The PDF file usually contains a resume, and since different resumes can have varying layouts and wording, how can I segment the parsed text into sections such as introduction, education, and experience?
Here is my code for parsing the PDF into text:
import React, { useState, useRef } from "react";
import * as pdfjs from "pdfjs-dist";
import { WorkerMessageHandler } from "pdfjs-dist/build/pdf.worker.min.mjs";
function PDFParser() {
const [extractedText, setExtractedText] = useState("");
const [pdfSrc, setPdfSrc] = useState(null);
const [selectedFileName, setSelectedFileName] = useState("");
const fileInputRef = useRef(null);
const handleFileChange = async (event) => {
const selectedFile = event.target.files[0];
if (!selectedFile) {
return;
}
const fileReader = new FileReader();
fileReader.onload = async () => {
const arrayBuffer = fileReader.result;
try {
pdfjs.GlobalWorkerOptions.workerSrc = "pdf.worker.min.mjs";
const pdf = await pdfjs.getDocument({ data: arrayBuffer }).promise;
const numPages = pdf.numPages;
let extractedText = "";
for (let i = 1; i <= numPages; i++) {
const page = await pdf.getPage(i);
const pageText = await page.getTextContent();
// Map over text items and join them with a newline character
const pageLines = pageText.items.map((item) => item.str).join("\n");
// Append the lines from this page to the extracted text
if (extractedText !== "") {
extractedText += "\n";
}
extractedText += pageLines;
}
setExtractedText(extractedText);
setPdfSrc(URL.createObjectURL(selectedFile));
setSelectedFileName(selectedFile.name);
} catch (error) {
console.error("Error parsing PDF:", error);
}
};
setExtractedText("");
fileReader.readAsArrayBuffer(selectedFile);
};
return (
<div>
<input
type="file"
onChange={handleFileChange}
accept=".pdf"
ref={fileInputRef}
style={{ display: "none" }}
/>
<button className="UploadButton" onClick={openFileDialog}>
Upload PDF
</button>
<div className="ScrollableContainer">
{extractedText && (
<HTMLContent text={extractedText}/>
)}
</div>
</div>
);
}
I have tried converting the output to HTML, but pdfjs-dist does not let me convert it to HTML correctly.
Can someone suggest other ways to parse the text?
import React, { useState, useRef } from "react";
import * as pdfjs from "pdfjs-dist";
import { WorkerMessageHandler } from "pdfjs-dist/build/pdf.worker.min.mjs";
function PDFParser() {
const [extractedText, setExtractedText] = useState("");
const [pdfSrc, setPdfSrc] = useState(null);
const [selectedFileName, setSelectedFileName] = useState("");
const fileInputRef = useRef(null);
const handleFileChange = async (event) => {
const selectedFile = event.target.files[0];
if (!selectedFile) {
return;
}
const fileReader = new FileReader();
fileReader.onload = async () => {
const arrayBuffer = fileReader.result;
try {
pdfjs.GlobalWorkerOptions.workerSrc = "pdf.worker.min.mjs";
const pdf = await pdfjs.getDocument({ data: arrayBuffer }).promise;
const numPages = pdf.numPages;
let extractedText = "";
for (let i = 1; i <= numPages; i++) {
const page = await pdf.getPage(i);
const pageText = await page.getTextContent();
// Map over text items and join them with a newline character
const pageLines = pageText.items.map((item) => item.str).join("\n");
// Append the lines from this page to the extracted text
if (extractedText !== "") {
extractedText += "\n";
}
extractedText += pageLines;
}
// Segment the extracted text into sections
const sections = segmentText(extractedText);
// Update state with segmented text
setExtractedText(sections);
setPdfSrc(URL.createObjectURL(selectedFile));
setSelectedFileName(selectedFile.name);
} catch (error) {
console.error("Error parsing PDF:", error);
}
};
setExtractedText("");
fileReader.readAsArrayBuffer(selectedFile);
};
// Function to segment text into sections
const segmentText = (text) => {
// Split text into lines
const lines = text.split("\n");
// Define section keywords
const sectionKeywords = ["education", "experience", "skills", "summary"];
// Initialize sections object
const sections = {};
// Initialize current section
let currentSection = "";
// Iterate over lines to identify section boundaries
lines.forEach((line) => {
const lowerCaseLine = line.toLowerCase();
// Check if line contains a section keyword
const matchedKeyword = sectionKeywords.find(keyword => lowerCaseLine.includes(keyword));
if (matchedKeyword) {
currentSection = matchedKeyword;
if (!sections[currentSection]) {
sections[currentSection] = [];
}
} else {
// Add line to current section
if (currentSection !== "") {
sections[currentSection].push(line);
}
}
});
return sections;
};
const openFileDialog = () => {
if (fileInputRef.current) {
fileInputRef.current.click();
}
};
return (
<div>
<input
type="file"
onChange={handleFileChange}
accept=".pdf"
ref={fileInputRef}
style={{ display: "none" }}
/>
<button className="UploadButton" onClick={openFileDialog}>
Upload PDF
</button>
<div className="ScrollableContainer">
{Object.keys(extractedText).map((section, index) => (
<div key={index}>
<h2>{section.toUpperCase()}</h2>
<ul>
{extractedText[section].map((item, idx) => (
<li key={idx}>{item}</li>
))}
</ul>
</div>
))}
</div>
</div>
);
}
export default PDFParser;