diff --git a/backend/utils/utils.py b/backend/utils/utils.py
index ee32251..f0fbc82 100644
--- a/backend/utils/utils.py
+++ b/backend/utils/utils.py
@@ -1,6 +1,11 @@
 import string
 import hashlib
 import random
+import os
+from PyPDF2 import PdfReader
+
+FILE_NAME = 'manjil.pdf'
+FILE_PATH = os.path.join(os.getcwd(), FILE_NAME)
 
 def random_string_generator(string_length: int) -> str:
     letters = string.ascii_letters
@@ -9,3 +14,26 @@ def random_string_generator(string_length: int) -> str:
 
 def hash_string(string_value: str) ->str:
     return hashlib.sha256(string_value.encode('utf-8')).hexdigest()
+
+def read_pdf_human_readable(file_path: str) -> list[str]:
+    """Return the stripped text of every page in a PDF that has text.
+
+    Args:
+        file_path: Path of the PDF file to read.
+
+    Returns:
+        One string per page, in page order, with surrounding whitespace
+        removed. Pages with no extractable text, or only whitespace,
+        are skipped.
+    """
+    reader: PdfReader = PdfReader(file_path)
+    pdf_page_text_contents: list[str] = []
+    for page in reader.pages:
+        # Strip before the emptiness check so that whitespace-only pages
+        # are skipped too, instead of being appended as "".
+        text: str = (page.extract_text() or "").strip()
+        if text:
+            pdf_page_text_contents.append(text)
+    return pdf_page_text_contents
+
+