Skip to content

Commit

Permalink
Fix unstructured notebook (#408)
Browse files Browse the repository at this point in the history
* remove-dotenv

* fix-doc-error

* notebook

* doc

* wget resource

* format

---------

Co-authored-by: Nicolò Boschi <boschi1997@gmail.com>
  • Loading branch information
mendonk and nicoloboschi authored May 13, 2024
1 parent f7b6b52 commit 5be69aa
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 204 deletions.
50 changes: 39 additions & 11 deletions docs/modules/examples/pages/langchain-unstructured-astra.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ DB Access Token] with Database Administrator permissions.
Install the following dependencies:
[source,python]
----
pip install ragstack-ai python-dotenv
pip install ragstack-ai
----
See the https://docs.datastax.com/en/ragstack/docs/prerequisites.html[Prerequisites] page for more details.

Expand Down Expand Up @@ -163,13 +163,15 @@ for el in elements:
if el.category in ["Header", "Footer"]:
continue # skip these
if el.category == "Title":
documents.append(current_doc)
if current_doc is not None:
documents.append(current_doc)
current_doc = None
if not current_doc:
current_doc = Document(page_content="", metadata=el.metadata.to_dict())
current_doc.page_content += el.metadata.text_as_html if el.category == "Table" else el.text
if el.category == "Table":
documents.append(current_doc)
if current_doc is not None:
documents.append(current_doc)
current_doc = None
astra_db_store.add_documents(documents)
Expand Down Expand Up @@ -197,7 +199,7 @@ chain = (

== Execute queries

. Ask a question that should be answered by the text of the document - this query should return a relevant response.
. Ask a question that should be answered by the text of the document - this query should return `Reducing the attention key size hurts model quality.`.
+
[source,python]
----
Expand All @@ -206,7 +208,9 @@ print("\n***********New Unstructured Basic Query Engine***********")
print(response_1)
----
+
. Ask a question that can be answered from the table data. This highlights the power of using Unstructured.io.
. Ask a question that can be answered from the table data.
This query should return `The 'WSJ 23 F1' value for 'Dyer et al. (2016) (5]' was 91.7.` because the table data contains this information.
This highlights the power of using Unstructured.io.
+
[source,python]
----
Expand All @@ -215,7 +219,7 @@ print("\n***********New Unstructured Basic Query Engine***********")
print(response_2)
----
. Ask a question with an expected lack of context.
This query should return `I don't know. The context does not provide any information about George Washington's birthdate.` because your document does not contain information about the George Washington.
This query should return `I don't know. The context does not provide any information about George Washington's birthdate.` because your document does not contain information about George Washington.
+
[source,python]
----
Expand All @@ -235,20 +239,25 @@ import os
import requests
from dotenv import load_dotenv
from langchain_community.document_loaders import unstructured
from langchain_astradb import AstraDBVectorStore
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import (
unstructured,
UnstructuredAPIFileLoader,
)
from langchain_openai import (
ChatOpenAI,
OpenAIEmbeddings,
)
load_dotenv()
# download pdf
url = "https://raw.githubusercontent.com/datastax/ragstack-ai/48bc55e7dc4de6a8b79fcebcedd242dc1254dd63/examples/notebooks/resources/attention_pages_9_10.pdf"
file_path = "./attention_pages_9_10.pdf"
Expand All @@ -259,8 +268,19 @@ if response.status_code == 200:
print("Download complete.")
else:
print("Error downloading the file.")
exit(1)
# simple parse
loader = UnstructuredAPIFileLoader(
file_path="./attention_pages_9_10.pdf",
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
url = os.getenv("UNSTRUCTURED_API_URL"),
)
simple_docs = loader.load()
print(len(simple_docs))
print(simple_docs[0].page_content[0:400])
# complex parse
elements = unstructured.get_elements_from_api(
file_path="./attention_pages_9_10.pdf",
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
Expand All @@ -269,31 +289,40 @@ elements = unstructured.get_elements_from_api(
pdf_infer_table_structure=True,
)
print(len(elements))
tables = [el for el in elements if el.category == "Table"]
print(tables[1].metadata.text_as_html)
# create vector store
astra_db_store = AstraDBVectorStore(
collection_name="langchain_unstructured",
embedding=OpenAIEmbeddings(),
token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"),
api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT")
)
# load documents
documents = []
current_doc = None
for el in elements:
if el.category in ["Header", "Footer"]:
continue # skip these
if el.category == "Title":
documents.append(current_doc)
if current_doc is not None:
documents.append(current_doc)
current_doc = None
if not current_doc:
current_doc = Document(page_content="", metadata=el.metadata.to_dict())
current_doc.page_content += el.metadata.text_as_html if el.category == "Table" else el.text
if el.category == "Table":
documents.append(current_doc)
if current_doc is not None:
documents.append(current_doc)
current_doc = None
astra_db_store.add_documents(documents)
# prompt and query
prompt = """
Answer the question based only on the supplied context. If you don't know the answer, say "I don't know".
Context: {context}
Expand Down Expand Up @@ -321,7 +350,6 @@ print(response_2)
response_3 = chain.invoke("When was George Washington born?")
print("\n***********New Unstructured Basic Query Engine***********")
print(response_3)
----
====

Loading

0 comments on commit 5be69aa

Please sign in to comment.