You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
29 lines
1.2 KiB
29 lines
1.2 KiB
import os
|
|
from PyPDF2 import PdfReader, PdfWriter
|
|
|
|
# Directory that receives the per-page .txt and .pdf output files.
PATH = 'pdfcontent'

# exist_ok=True makes creation idempotent and removes the window between
# "check that it exists" and "create it" in which another process could
# create the directory and make makedirs() fail.
os.makedirs(PATH, exist_ok=True)
|
|
|
|
def getPdfFile(filepath):
    """Split the PDF at *filepath* into per-page files under ``PATH``.

    For each page, two outputs may be written, both named after the input
    file's base name (without extension) and the 1-based page number:

    * ``<name>_page<N>.txt`` -- the extracted text, written only when the
      page yields non-empty text.
    * ``<name>_page<N>.pdf`` -- a single-page PDF containing that page.

    A progress line is printed for every file written.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        None.
    """
    # The base name does not depend on the page, so compute it once. Doing
    # it here (rather than inside the ``if text`` branch) also guarantees it
    # is defined when a page has no extractable text.
    filename = os.path.splitext(os.path.basename(filepath))[0]

    with open(filepath, 'rb') as pdf_file:
        reader = PdfReader(pdf_file)

        # Page numbers in file names and messages are 1-based.
        for page_num, page in enumerate(reader.pages, start=1):
            text = page.extract_text()

            # Save as text file
            if text:  # Ensure there's text on the page before saving
                output_txt_filename = os.path.join(
                    PATH, f"{filename}_page{page_num}.txt"
                )
                with open(output_txt_filename, 'w', encoding='utf-8') as output_file:
                    output_file.write(text)
                print(f"Page {page_num} extracted and saved as {output_txt_filename}")

            # Save as PDF file
            # NOTE(review): the original indentation was lost; this assumes
            # the PDF export runs for every page, not only for pages that
            # contain text -- confirm against the intended behavior.
            writer = PdfWriter()
            writer.add_page(page)
            output_pdf_filename = os.path.join(
                PATH, f"{filename}_page{page_num}.pdf"
            )
            with open(output_pdf_filename, 'wb') as output_pdf_file:
                writer.write(output_pdf_file)
            print(f"Page {page_num} extracted and saved as {output_pdf_filename}")
|
|
|