Transform PDF to MP3
Tags: #pdf #text2audio #snippet
Author: Sanjay Sabu

Input

Installing necessary packages

1
!pip install pdfminer.six
Copied!
1
!pip install gTTS
Copied!

Import libraries

1
from io import StringIO
2
from pdfminer.converter import TextConverter
3
from pdfminer.layout import LAParams
4
from pdfminer.pdfdocument import PDFDocument
5
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
6
from pdfminer.pdfpage import PDFPage
7
from pdfminer.pdfparser import PDFParser
8
from gtts import gTTS
Copied!

Model

Functions to convert a PDF file to text and to parse table-of-contents entries

1
def convert_pdf_to_string(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        The text written by the pdfminer ``TextConverter`` for all pages,
        in the order the pages are yielded by ``PDFPage.create_pages``.
    """
    output_string = StringIO()
    with open(file_path, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        # The converter writes the text of each processed page into
        # output_string.
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    return output_string.getvalue()
14
15
16
def convert_title_to_filename(title):
    """Turn a title into a lowercase, underscore-separated name.

    Args:
        title: Title text to convert.

    Returns:
        The title lowercased, with every space character replaced by ``_``.
        No file extension is added and all other characters are kept as-is.
    """
    return title.lower().replace(' ', '_')
20
21
22
def split_to_title_and_pagenum(table_of_contents_entry):
    """Split a table-of-contents entry into its title and trailing page number.

    The page number is the run of digits at the very end of the entry (after
    surrounding whitespace has been stripped); the title is everything before
    that run, with surrounding whitespace stripped.

    Args:
        table_of_contents_entry: One line of a table of contents, e.g.
            ``"Introduction 5"``.

    Returns:
        A ``(title, pagenum)`` tuple. ``pagenum`` is an ``int``. Both values
        are ``None`` when the entry is empty or does not end in a digit. An
        entry made only of digits yields an empty title.
    """
    entry = table_of_contents_entry.strip()

    # An entry that is empty or does not end in a digit has no page number.
    if not entry or not entry[-1].isdigit():
        return None, None

    # Walk left from the end to find where the trailing run of digits starts.
    # The bounds check keeps an all-digit entry from running off the front.
    digits_start = len(entry) - 1
    while digits_start > 0 and entry[digits_start - 1].isdigit():
        digits_start -= 1

    # Split exactly at the first digit so that a non-space separator (such as
    # a dot leader) stays with the title instead of corrupting the number.
    title = entry[:digits_start].strip()
    pagenum = int(entry[digits_start:])
    return title, pagenum
38
Copied!

Output

Content

1
# Name of the .pdf file you want to convert.
pdf_name = 'Installation_Guide.pdf'

# Print the text extracted from that file.
print(convert_pdf_to_string(pdf_name))
Copied!

Converting to mp3

1
# Extract the whole document as a single string.
rr = convert_pdf_to_string(pdf_name)

# convert_pdf_to_string already returns one str, so it is used directly
# instead of being rebuilt character by character.
string_of_text = rr

final_file = gTTS(text=string_of_text, lang='en')  # text-to-speech object for the extracted text
final_file.save("Generated Speech.mp3")  # save file to computer
Copied!
Last modified 2mo ago
Copy link
Edit on GitHub