Query Completeness¶
Demonstration of how to get complete responses to natural language queries using the WhyHow SDK.
Import & configure dependencies¶
from llmsherpa.readers import LayoutPDFReader
import os
import json
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from openai import OpenAI
from whyhow import Triple, Node, Relation, WhyHow
os.environ['PINECONE_API_KEY'] = "<pinecone api key>"
os.environ['OPENAI_API_KEY'] = "<openai api key>"
whyhow_client = WhyHow(api_key="<whyhow api key>", base_url="https://api.whyhow.ai")
# Requires that nlm-ingestor is running - https://github.com/nlmatics/nlm-ingestor
pdf_reader = LayoutPDFReader("http://localhost:5010/api/parseDocument?renderFormat=all")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
llm = ChatOpenAI(model="gpt-4o")
llm_transformer = LLMGraphTransformer(llm=llm)
openai_client = OpenAI()
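Before parsing anything, it helps to confirm the nlm-ingestor service is actually reachable. A minimal sketch, assuming the default localhost:5010 address from the reader URL above:
import requests
try:
    requests.head("http://localhost:5010", timeout=5)
    print("nlm-ingestor is reachable")
except requests.exceptions.ConnectionError:
    print("nlm-ingestor is not running; start it before parsing")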
10-K Documents¶
In this example, we leverage 10-K filings from Amazon and Walmart for the past five years. Specifically, we use the legal proceedings reported by each of these companies. You can find these documents on the SEC website or on each company's Investor Relations page.
# Specify docs to process
sec_10k_docs = [
{
"company": "Walmart",
"year": 2024,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0000104169/b9745621-712b-43d0-b8e9-5cf33db255e3.pdf"
},
{
"company": "Walmart",
"year": 2023,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0000104169/dfe6ee99-8fe6-4333-80ac-829d9e7595fa.pdf"
},
{
"company": "Walmart",
"year": 2022,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0000104169/c68fb8be-2602-4f2a-aee0-261b4f04b970.pdf"
},
{
"company": "Walmart",
"year": 2021,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0000104169/598c8825-536a-4371-ab8a-98b9ee761c43.pdf"
},
{
"company": "Walmart",
"year": 2020,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0000104169/af5415d9-0e07-4ba1-a6cc-bb3058a7f4e8.pdf"
},
{
"company": "Amazon",
"year": 2024,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001018724/c7c14359-36fa-40c3-b3ca-5bf7f3fa0b96.pdf"
},
{
"company": "Amazon",
"year": 2023,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001018724/d2fde7ee-05f7-419d-9ce8-186de4c96e25.pdf"
},
{
"company": "Amazon",
"year": 2022,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001018724/f965e5c3-fded-45d3-bbdb-f750f156dcc9.pdf"
},
{
"company": "Amazon",
"year": 2021,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001018724/336d8745-ea82-40a5-9acc-1a89df23d0f3.pdf"
},
{
"company": "Amazon",
"year": 2020,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001018724/4d39f579-19d8-4119-b087-ee618abf82d6.pdf"
}
]
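The vector database section below reads these filings from local files, so download them first. A minimal sketch; the ../10k_docs directory and the file naming convention are assumptions chosen to match the paths used later:
import pathlib
import requests

docs_dir = pathlib.Path("../10k_docs")
docs_dir.mkdir(exist_ok=True)
for doc in sec_10k_docs:
    ticker = "amzn" if doc["company"] == "Amazon" else "wmt"
    target = docs_dir / f"{ticker}_10k_{doc['year']}.pdf"
    if not target.exists():
        # Download the filing PDF from the investor relations CDN
        target.write_bytes(requests.get(doc["url"], timeout=60).content)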
Vector database retrieval¶
Upsert to Pinecone¶
Prepare and upsert documents to Pinecone so we can test vector search.
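If the index doesn't exist yet, create it before upserting. A sketch assuming a serverless index and OpenAI's default 1536-dimension embeddings; adjust the index name, cloud, and region to your own setup:
import os
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
if "<index name>" not in pc.list_indexes().names():
    pc.create_index(
        name="<index name>",
        dimension=1536,  # matches OpenAIEmbeddings' default model
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )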
import re
import uuid
import datetime
import itertools
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from pinecone import Pinecone, Index
from typing import List, Dict, Union
pinecone_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_key)
index = pc.Index("<index name>")
def generate_namespace() -> str:
    """
    Generate a unique namespace.
    Returns:
    str: The generated namespace.
    """
    unique_id = str(uuid.uuid4())
    date = datetime.date.today()
    # Replace the dashes in the ISO date with underscores
    date = re.sub('[^0-9a-zA-Z ]+', '_', str(date))
    namespace = '_'.join((date, unique_id))
    return namespace
def split_document(doc: str) -> List[Document]:
    """
    Split a document into chunks.
    Parameters:
    doc (str): Path to the PDF document to split.
    Returns:
    List[Document]: The list of chunks (text plus metadata).
    """
    loader = PyPDFLoader(doc)
    docs = loader.load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
    chunks = splitter.split_documents(docs)
    return chunks
def clean_chunks(chunks: List[Document]) -> List[str]:
    """
    Clean a list of chunks.
    Parameters:
    chunks (List[Document]): The list of chunks to clean.
    Returns:
    List[str]: The list of cleaned chunk texts.
    """
    cleaned = []
    for chunk in chunks:
        # Strip newline characters from the chunk text
        cleaned.append(re.sub(r'(\r\n|\n|\r)', '', chunk.page_content))
    return cleaned
def generate_vectors(chunks: List[Document], document_id: str) -> List[Dict[str, Union[str, List[float], Dict[str, str]]]]:
    """
    Generate vectors for a list of chunks.
    Parameters:
    chunks (List[Document]): The list of chunks to generate vectors for.
    document_id (str): The id of the document the chunks belong to.
    Returns:
    List[Dict[str, Union[str, List[float], Dict[str, str]]]]: The list of generated vectors.
    """
    clean_chunk_array = []
    for chunk in chunks:
        # Strip newline characters from the chunk text before embedding
        clean_chunk = re.sub(r'(\r\n|\n|\r)', '', chunk.page_content)
        clean_chunk_array.append(clean_chunk)
    embeddings = OpenAIEmbeddings()
    embeddings_array = embeddings.embed_documents(clean_chunk_array)
    vectors = []
    for i, chunk in enumerate(chunks):
        # PyPDFLoader pages are zero-indexed; store them one-indexed
        page = chunk.metadata['page'] + 1
        vectors.append({
            'id': chunk.metadata['source'] + '_' + str(page) + '_' + str(i),
            'values': embeddings_array[i],
            'metadata': {
                'page': page,
                'pageContent': chunk.page_content,
                'filename': chunk.metadata['source'],
                'documentId': document_id
            },
        })
    return vectors
def upload_vectors(index: Index, vectors: List[Dict[str, Union[str, List[float], Dict[str, str]]]], namespace: str):
    """
    Upload a list of vectors to an index.
    Parameters:
    index (Index): The index to upload to.
    vectors (List[Dict[str, Union[str, List[float], Dict[str, str]]]]): The list of vectors to upload.
    namespace (str): The namespace to upload to.
    """
    def chunks(iterable, batch_size=100):
        # Yield successive batches from the iterable
        it = iter(iterable)
        chunk = tuple(itertools.islice(it, batch_size))
        while chunk:
            yield chunk
            chunk = tuple(itertools.islice(it, batch_size))
    for vectors_chunk in chunks(vectors, batch_size=100):
        result = index.upsert(vectors=vectors_chunk, namespace=namespace)
        print(result)
def upload_docs(index_name: str, documents: List[str]) -> Dict[str, Union[str, List[Dict[str, str]]]]:
    """
    Upload documents to an index.
    Parameters:
    index_name (str): The name of the index to upload to.
    documents (List[str]): The documents to upload.
    Returns:
    dict: A dictionary containing the namespace and the names of the uploaded documents.
    """
    document_names = []
    namespace = generate_namespace()
    index = pc.Index(index_name)
    for doc in documents:
        document_id = str(uuid.uuid4())
        chunks = split_document(doc)
        document_names.append({'source': chunks[0].metadata['source'], 'documentId': document_id})
        vectors = generate_vectors(chunks, document_id)
        upload_vectors(index, vectors, namespace)
        print(f"{len(vectors)} vectors uploaded for {doc}")
    return {'namespace': namespace, 'documents': document_names}
docs = [
"../10k_docs/amzn_10k_2020.pdf",
"../10k_docs/amzn_10k_2021.pdf",
"../10k_docs/amzn_10k_2022.pdf",
"../10k_docs/amzn_10k_2023.pdf",
"../10k_docs/amzn_10k_2024.pdf",
"../10k_docs/wmt_10k_2020.pdf",
"../10k_docs/wmt_10k_2021.pdf",
"../10k_docs/wmt_10k_2022.pdf",
"../10k_docs/wmt_10k_2023.pdf",
"../10k_docs/wmt_10k_2024.pdf",
]
upload_docs("<index name>",docs)
Query Pinecone¶
query = "What are the lawsuits that Amazon is facing?"
chunks_extracted = 0
index = pc.Index("<index name>")
concatenatedChunks = ""
embeddings = OpenAIEmbeddings()
query_embedding = embeddings.embed_query(query)
query_response = index.query(
namespace="<namespace name>",
top_k=64,
vector=query_embedding,
include_metadata=True
)
for chunk in query_response.matches:
concatenatedChunks += chunk.metadata['pageContent'] + ' '
chunks_extracted += 1
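Before generating an answer, it can help to eyeball the top matches and their similarity scores:
# Inspect the highest-scoring chunks returned by Pinecone
for match in query_response.matches[:5]:
    print(round(match.score, 3), match.metadata['filename'], 'page', match.metadata['page'])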
Generate response from retrieved chunks¶
answer_prompt = f"""
Context:
You are a helpful chatbot. Your job is to anwer the following question using only the context provided in the Context below:
Question: {query}
Context: {concatenatedChunks}
Answer:"""
response = openai_client.chat.completions.create(
model="gpt-4o",
messages=[{
"role": "user",
"content": answer_prompt
}],
max_tokens=4000
)
print(response.choices[0].message.content)
Graph Retrieval¶
Extract triples using custom prompt¶
A simple script that extracts triples using a custom prompt, instructing the LLM to build precise triples for our use case.
async def process_customprompt(section_text, company_name):
    prompt = f"""
    Using the content below, extract the companies involved and output them in the following JSON format:
    [{{"head": {{"type": "Company", "id": "{company_name}"}}, "relation": "INVOLVED_IN", "tail": {{"type": "Legal Proceeding", "id": <legal proceeding name>}}}}, ...]
    Do not include supporting information. Do not wrap the response in JSON markers. If there is no relevant information, just return an empty array.
    Content:{section_text}
    """
    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": prompt},
        ],
        max_tokens=4000,
        temperature=0.1
    )
    triples = json.loads(response.choices[0].message.content)
    return triples
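Model output occasionally isn't valid JSON, so a small guard keeps the extraction loop from crashing. safe_parse_triples is a hypothetical helper, not part of any SDK; you could call it on the raw completion in place of json.loads above:
def safe_parse_triples(raw: str) -> list:
    """Fall back to an empty list when the model returns invalid JSON."""
    try:
        parsed = json.loads(raw)
        return parsed if isinstance(parsed, list) else []
    except json.JSONDecodeError:
        return []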
Parse 10-K Documents¶
Using llmsherpa, we parse the 10-K documents and extract the most relevant information for our use case. This ensures we're pulling meaningful data about legal proceedings from both Walmart's and Amazon's 10-K filings.
custom_prompt_triples = []
for pdf in sec_10k_docs:
    doc = pdf_reader.read_pdf(pdf["url"])
    for section in doc.sections():
        if "Legal Proceedings" in section.to_text():
            section_text = section.to_text(include_children=True, recurse=True)
            # Get triples from custom prompt
            custom_prompt_out = await process_customprompt(section_text, pdf["company"])
            custom_prompt_triples.extend(custom_prompt_out)
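The same proceeding often recurs across filing years, so it is worth deduplicating the raw triples before formatting them. A sketch over the head/relation/tail shape produced by the custom prompt:
# Keep the first occurrence of each (head, relation, tail) combination
seen = set()
deduped = []
for triple in custom_prompt_triples:
    key = (triple['head']['id'], triple['relation'], triple['tail']['id'])
    if key not in seen:
        seen.add(key)
        deduped.append(triple)
custom_prompt_triples = deduped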
Format triples¶
Transform the triples into the WhyHow format.
def format_custom_triple(triple):
    head = triple['head']
    tail = triple['tail']
    return Triple(
        head=Node(name=head['id'], label=head['type']),
        relation=Relation(name=triple['relation']),
        tail=Node(name=tail['id'], label=tail['type'])
    )

formatted_triples = [format_custom_triple(triple) for triple in custom_prompt_triples]
Build Graph¶
From the structured triples, create a graph using the WhyHow SDK.
# Create workspace
workspace = whyhow_client.workspaces.create(name="<workspace name>")
# Build graph from formatted triples
graph = whyhow_client.graphs.create_graph_from_triples(
workspace_id=workspace.workspace_id,
triples=formatted_triples,
name="<graph name>"
)
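Keep the new IDs around; the queries below reference graph.graph_id:
print(f"Workspace: {workspace.workspace_id}, Graph: {graph.graph_id}")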
Query graph¶
Run a hybrid search using the structured and unstructured query methods, then combine the results to generate a final answer.
Build and run a structured query using the inferred graph schema.
schema_id = whyhow_client.graphs.get(graph_id=graph.graph_id).schema_id
schema = whyhow_client.schemas.get(schema_id)
entities = [entity.name for entity in schema.entities]
relations = [relation.name for relation in schema.relations]
print(entities)
print(relations)
query = "What lawsuits is Amazon dealing with?"
# Using the entities and relations from the schema, we use an LLM to extract relevant entities and relations from the question and build a structured query, helping guarantee completeness
prompt = f"""
Perform entity and relation extraction on the question below using the list of entity types and relation types provided.
The output should an object with three arrays: "entity_types" which are the entity types detected in the question, "relation_types" which are the relation types detected in the question, and "values" which are the relevant entity names detected in the question.
The output should look like this:
{{"entity_types": ["Person", "Place"], "relation_types": ["LIVES_IN], "values": ["John Doe", "New York"]}}
Do not include supporting information. Do not wrap the response in JSON markers. If there is no relevant information, just return an empty array.
Question:{query}
Entities:{entities}
Relations:{relations}
"""
response = openai_client.chat.completions.create(
model="gpt-4o",
messages=[
{"role": "user", "content": prompt},
],
max_tokens=4000,
temperature=0.1
)
structured_query = json.loads(response.choices[0].message.content)
structured_query_response = whyhow_client.graphs.query_structured(
entities=structured_query["entity_types"],
relations=structured_query["relation_types"],
values=structured_query["values"],
graph_id=graph.graph_id)
print(structured_query_response)
Run an unstructured query against the same graph.
unstructured_query_response = whyhow_client.graphs.query_unstructured(query=query, graph_id=graph.graph_id)
print(unstructured_query_response)
Generate a final response by combining the results of the structured and unstructured queries.
prompt = f"""
Using the supporting information from the natural language response and the structured triples below, provide a detailed answer to the question below:
Question: {query}
Natural Language Response: {unstructured_query_response.answer}
Structured Triples: {structured_query_response}
"""
answer = openai_client.chat.completions.create(
model="gpt-4o",
messages=[
{"role": "user", "content": prompt},
],
max_tokens=4000,
temperature=0.1
)
print(answer.choices[0].message.content)