"""Utility helpers: string hashing and PDF text extraction."""
import hashlib
import os
import random
import string

# NOTE: `random_string_generator` (which uses `string` and `random`) also
# lives in this module; its body is not fully visible in this view, so it is
# not reproduced here.

FILE_NAME = 'manjil.pdf'
# Resolved against the process's current working directory at import time.
FILE_PATH = os.path.join(os.getcwd(), FILE_NAME)


def hash_string(string_value: str) -> str:
    """Return the hex SHA-256 digest of *string_value* encoded as UTF-8."""
    return hashlib.sha256(string_value.encode('utf-8')).hexdigest()


def read_pdf_human_readable(file_path: str) -> list[str]:
    """Extract the text of every non-empty page of a PDF.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        One stripped string per page, in page order. Pages that yield no
        text, or only whitespace, are omitted, so list indices do not
        necessarily correspond to page numbers.

    Raises:
        ImportError: If PyPDF2 is not installed.
        Exception: Whatever ``PdfReader`` raises for a missing or
            malformed file is propagated unchanged.
    """
    # Imported lazily so the rest of this module (e.g. ``hash_string``)
    # remains importable in environments without PyPDF2.
    from PyPDF2 import PdfReader

    reader = PdfReader(file_path)
    # Strip *before* testing for emptiness so whitespace-only pages are
    # dropped instead of being appended as ''. ``or ""`` guards against an
    # extractor that returns None for a page.
    stripped_pages = (
        (page.extract_text() or "").strip() for page in reader.pages
    )
    return [text for text in stripped_pages if text]