I am developing a program that converts PDF documents to Excel files. There are about 1000 PDF files to convert, but I got this error while processing the 234th file. Here is the relevant code.
/// <summary>
/// Converts every "*.pdf" file in the folder selected in <c>folderBrowserDialog1</c>
/// to text and writes one row per file into the first worksheet of a new Excel
/// workbook: column A receives the file name, column B the extracted text.
/// Progress is reported through <c>label4</c>, <c>label5</c> and <c>label6</c>, and the
/// workbook is finally saved as "Training Data.xlsx".
/// </summary>
/// <remarks>
/// Exceptions raised while enumerating files, converting a PDF, writing a cell or
/// saving the workbook are not handled here and propagate to the caller.
/// </remarks>
private void getFullFileContent()
{
    object missing = System.Reflection.Missing.Value;

    Excel.Application objExcel = new Excel.Application();
    objExcel.Visible = true;
    Excel.Workbook objBook = objExcel.Workbooks.Add(missing);

    // The target worksheet never changes, so look it up once instead of on
    // every iteration.
    Excel.Worksheet objSheet = (Excel.Worksheet)objBook.Worksheets.get_Item(1);

    DateTime startDate = DateTime.Now;
    string[] pdfFiles = Directory.GetFiles(folderBrowserDialog1.SelectedPath, "*.pdf");

    for (int index = 0; index < pdfFiles.Length; index++)
    {
        // Excel rows are 1-based while the array is 0-based.
        int row = index + 1;
        string pdfPath = pdfFiles[index];
        string pdfName = Path.GetFileName(pdfPath);

        Excel.Range nameCell = objSheet.get_Range("A" + row, missing);
        nameCell.set_Value(missing, pdfName);

        Excel.Range textCell = objSheet.get_Range("B" + row, missing);
        textCell.set_Value(missing, ConvertPdfToText(pdfPath));

        // Show the file that was just processed. The previous code indexed one
        // element past the current file and therefore threw an
        // IndexOutOfRangeException on the last iteration, before the workbook
        // could be saved.
        label4.Text = pdfName;
        label6.Text = "Remaining File: " + (pdfFiles.Length - row).ToString();

        TimeSpan elapsed = DateTime.Now - startDate;
        label5.Text = "Total time: " + elapsed.Hours.ToString() + ":" + elapsed.Minutes.ToString() + ":" + elapsed.Seconds.ToString();

        // No explicit GC.Collect() here: forcing a full blocking collection per
        // file only slows the loop down; the runtime collects on its own.
    }

    objBook.SaveAs("Training Data.xlsx");
    MessageBox.Show("Your PDF files converted and printed into \"Training Data.xlsx\"");
}
/// <summary>
/// Extracts the text of every page of the PDF file at <paramref name="path"/>
/// and returns it as one string, pages concatenated in order.
/// </summary>
/// <param name="path">Path of the PDF file to read.</param>
/// <returns>
/// The concatenated page text, or an empty string when the file does not exist.
/// </returns>
private string ConvertPdfToText(string path)
{
    if (!File.Exists(path))
    {
        return "";
    }

    // Accumulate in a StringBuilder; appending to a string once per page
    // re-copies everything read so far and is quadratic for long documents.
    StringBuilder text = new StringBuilder();

    PdfReader pdfReader = new PdfReader(path);
    try
    {
        for (int page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            string currentText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);

            // NOTE(review): this round-trips the text through Encoding.Default;
            // characters that encoding cannot represent are replaced. It is kept
            // so the output stays identical - confirm whether it is still wanted.
            currentText = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));

            text.Append(currentText);
        }
    }
    finally
    {
        // Close the reader even when extraction of a page throws, so the file
        // is not left open for the rest of the batch.
        pdfReader.Close();
    }

    return text.ToString();
}
How large are your files? Maybe you could try using a memory profiler like RedGate Ants or JetBrains dotTrace to figure out where memory is leaking (while using a smaller batch).
Also, this line from getFullFileContent() could be moved outside the for loop, since the worksheet does not change between iterations:
objSheet = (Excel.Worksheet)objBook.Worksheets.get_Item(1)
User contributions licensed under CC BY-SA 3.0