def convert_pdf_to_string(file_path):
    """Run every page of the PDF at *file_path* through a text converter.

    The file is opened in binary mode, wrapped in a parser/document pair,
    and each page yielded by ``PDFPage.create_pages`` is processed by an
    interpreter whose output device writes into an in-memory buffer.

    NOTE(review): ``PDFParser``, ``PDFDocument``, ``PDFResourceManager``,
    ``TextConverter``, ``LAParams``, ``PDFPageInterpreter`` and ``PDFPage``
    must already be in scope; they look like the pdfminer API -- confirm
    against the file's imports.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The accumulated contents of the buffer as a single string.
    """
    text_buffer = StringIO()
    with open(file_path, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        resource_manager = PDFResourceManager()
        converter = TextConverter(
            resource_manager, text_buffer, laparams=LAParams()
        )
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        # Each processed page appends its output to text_buffer via converter.
        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)
    return text_buffer.getvalue()
def convert_title_to_filename(title):
    """Return *title* with every space replaced by an underscore.

    The previous body read the local ``filename`` before assigning it, so
    every call raised ``UnboundLocalError``, and it never returned a value.
    The name is now derived from ``title`` and returned to the caller.

    Args:
        title: The title text to turn into a filename-friendly string.

    Returns:
        The title with each ``' '`` replaced by ``'_'``. No other
        characters are altered.
    """
    filename = title.replace(' ', '_')
    return filename
def split_to_title_and_pagenum(table_of_contents_entry):
    """Split a table-of-contents entry into a title and a trailing page number.

    The entry is stripped of surrounding whitespace. If it ends with a run
    of digits, that run is parsed as the page number and everything before
    it (stripped again) is the title.

    Args:
        table_of_contents_entry: One table-of-contents line, for example
            ``"Introduction 12"``.

    Returns:
        A ``(title, pagenum)`` tuple where ``title`` is a ``str`` and
        ``pagenum`` is an ``int``. ``(None, None)`` is returned when the
        entry is empty or does not end with a digit.
    """
    title_and_pagenum = table_of_contents_entry.strip()
    if not title_and_pagenum or not title_and_pagenum[-1].isdigit():
        return None, None

    # Walk backwards to the first character of the trailing digit run.
    # The `i > 0` bound keeps an all-digit entry (e.g. "42") from indexing
    # past the start of the string.
    i = len(title_and_pagenum) - 1
    while i > 0 and title_and_pagenum[i - 1].isdigit():
        i -= 1

    title = title_and_pagenum[:i].strip()
    pagenum = int(title_and_pagenum[i:].strip())
    return title, pagenum