Skip to content

Parsing Questions from PDFs Using Python

Python Virtual Environments


pip install pypdf
pip install spacy

python -m spacy download en_core_web_sm

Script

from pypdf import PdfReader

# Open the source PDF and record how many pages it has.
reader = PdfReader("example.pdf")
number_of_pages = len(reader.pages)

# Single-page example: index 10 is the 11th page (indices are zero-based).
# Guard the lookup so a document with fewer than 11 pages does not raise
# IndexError; in that case `page` is None and `text` is empty.
page = reader.pages[10] if number_of_pages > 10 else None
text = page.extract_text() if page is not None else ""

# Combined text of the first 100 pages (or of every page when the document
# is shorter). The question-extraction step later in this script reads this
# name, so it must be defined here.
first_100_pages = "\n".join(
    pdf_page.extract_text() or "" for pdf_page in reader.pages[:100]
)



import spacy

# Load spaCy's small English pipeline (fetched with `spacy download`).
nlp = spacy.load("en_core_web_sm")

# Text to analyse. For a quick experiment, a literal can be used instead:
# text = "This is a sample text. It contains multiple sentences. Do you want to extract questions from it? If so, how should I proceed?"
text = first_100_pages

# Run the pipeline once; the resulting Doc exposes sentence spans.
doc = nlp(text)

# Gather the text of every sentence, then keep the ones that contain a
# question mark.
sentences = [span.text for span in doc.sents]
questions = [candidate for candidate in sentences if '?' in candidate]

# To inspect every detected sentence, uncomment the lines below.
# print("Sentences:")
# for sentence in sentences:
#     print(sentence)

# Report the detected questions, one per line, under a heading.
print("\nQuestions:")
for question in questions:
    print(question)