Query Completeness¶
Demonstration of how to get complete responses to natural language queries using the WhyHow SDK.
Import & configure dependencies¶
from llmsherpa.readers import LayoutPDFReader
import os
import json
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from openai import OpenAI
from whyhow import Triple, Node, Relation, WhyHow
os.environ['PINECONE_API_KEY'] = "<pinecone api key>"
os.environ['OPENAI_API_KEY'] = "<openai api key>"
whyhow_client = WhyHow(api_key="<whyhow api key>", base_url="https://api.whyhow.ai")
# Requires that nlm-ingestor is running - https://github.com/nlmatics/nlm-ingestor
pdf_reader = LayoutPDFReader("http://localhost:5010/api/parseDocument?renderFormat=all")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
llm = ChatOpenAI(model="gpt-4o")
llm_transformer = LLMGraphTransformer(llm=llm)
openai_client = OpenAI()
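Before parsing anything, it helps to confirm the nlm-ingestor service is actually reachable. A minimal sketch, assuming the default localhost:5010 address from the reader URL above:
import requests
try:
    requests.head("http://localhost:5010", timeout=5)
    print("nlm-ingestor is reachable")
except requests.exceptions.ConnectionError:
    print("nlm-ingestor is not running; start it before parsing")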
10-K Documents¶
In this example, we leverage 10-K filings from Amazon and Walmart for the past five years. Specifically, we use the legal proceedings reported by each of these companies. You can find these documents on the SEC website or on each company's Investor Relations page.
# Specify docs to process
sec_10k_docs = [
{
"company": "Walmart",
"year": 2024,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0000104169/b9745621-712b-43d0-b8e9-5cf33db255e3.pdf"
},
{
"company": "Walmart",
"year": 2023,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0000104169/dfe6ee99-8fe6-4333-80ac-829d9e7595fa.pdf"
},
{
"company": "Walmart",
"year": 2022,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0000104169/c68fb8be-2602-4f2a-aee0-261b4f04b970.pdf"
},
{
"company": "Walmart",
"year": 2021,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0000104169/598c8825-536a-4371-ab8a-98b9ee761c43.pdf"
},
{
"company": "Walmart",
"year": 2020,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0000104169/af5415d9-0e07-4ba1-a6cc-bb3058a7f4e8.pdf"
},
{
"company": "Amazon",
"year": 2024,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001018724/c7c14359-36fa-40c3-b3ca-5bf7f3fa0b96.pdf"
},
{
"company": "Amazon",
"year": 2023,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001018724/d2fde7ee-05f7-419d-9ce8-186de4c96e25.pdf"
},
{
"company": "Amazon",
"year": 2022,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001018724/f965e5c3-fded-45d3-bbdb-f750f156dcc9.pdf"
},
{
"company": "Amazon",
"year": 2021,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001018724/336d8745-ea82-40a5-9acc-1a89df23d0f3.pdf"
},
{
"company": "Amazon",
"year": 2020,
"url": "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001018724/4d39f579-19d8-4119-b087-ee618abf82d6.pdf"
}
]
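The vector database section below reads these filings from local files, so download them first. A minimal sketch; the ../10k_docs directory and the file naming convention are assumptions chosen to match the paths used later:
import pathlib
import requests

docs_dir = pathlib.Path("../10k_docs")
docs_dir.mkdir(exist_ok=True)
for doc in sec_10k_docs:
    ticker = "amzn" if doc["company"] == "Amazon" else "wmt"
    target = docs_dir / f"{ticker}_10k_{doc['year']}.pdf"
    if not target.exists():
        # Download the filing PDF from the investor relations CDN
        target.write_bytes(requests.get(doc["url"], timeout=60).content)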
Vector database retrieval¶
Upsert to Pinecone¶
Prepare and upsert documents to Pinecone so we can test vector search.
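If the index doesn't exist yet, create it before upserting. A sketch assuming a serverless index and OpenAI's default 1536-dimension embeddings; adjust the index name, cloud, and region to your own setup:
import os
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
if "<index name>" not in pc.list_indexes().names():
    pc.create_index(
        name="<index name>",
        dimension=1536,  # matches OpenAIEmbeddings' default model
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )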
import re
import uuid
import datetime
import itertools
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from pinecone import Pinecone, Index
from typing import List, Dict, Union
pinecone_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_key)
index = pc.Index("<index name>")
def generate_namespace() -> str:
    """
    Generate a unique namespace.
    Returns:
    str: The generated namespace.
    """
    unique_id = str(uuid.uuid4())
    date = datetime.date.today()
    # Replace the dashes in the ISO date with underscores
    date = re.sub('[^0-9a-zA-Z ]+', '_', str(date))
    namespace = '_'.join((date, unique_id))
    return namespace
def split_document(doc: str) -> List[Document]:
    """
    Split a document into chunks.
    Parameters:
    doc (str): Path to the PDF document to split.
    Returns:
    List[Document]: The list of chunks (text plus metadata).
    """
    loader = PyPDFLoader(doc)
    docs = loader.load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
    chunks = splitter.split_documents(docs)
    return chunks
def clean_chunks(chunks: List[Document]) -> List[str]:
    """
    Clean a list of chunks.
    Parameters:
    chunks (List[Document]): The list of chunks to clean.
    Returns:
    List[str]: The list of cleaned chunk texts.
    """
    cleaned = []
    for chunk in chunks:
        # Strip newline characters from the chunk text
        cleaned.append(re.sub(r'(\r\n|\n|\r)', '', chunk.page_content))
    return cleaned
def generate_vectors(chunks: List[Document], document_id: str) -> List[Dict[str, Union[str, List[float], Dict[str, str]]]]:
    """
    Generate vectors for a list of chunks.
    Parameters:
    chunks (List[Document]): The list of chunks to generate vectors for.
    document_id (str): The id of the document the chunks belong to.
    Returns:
    List[Dict[str, Union[str, List[float], Dict[str, str]]]]: The list of generated vectors.
    """
    clean_chunk_array = []
    for chunk in chunks:
        # Strip newline characters from the chunk text before embedding
        clean_chunk = re.sub(r'(\r\n|\n|\r)', '', chunk.page_content)
        clean_chunk_array.append(clean_chunk)
    embeddings = OpenAIEmbeddings()
    embeddings_array = embeddings.embed_documents(clean_chunk_array)
    vectors = []
    for i, chunk in enumerate(chunks):
        # PyPDFLoader pages are zero-indexed; store them one-indexed
        page = chunk.metadata['page'] + 1
        vectors.append({
            'id': chunk.metadata['source'] + '_' + str(page) + '_' + str(i),
            'values': embeddings_array[i],
            'metadata': {
                'page': page,
                'pageContent': chunk.page_content,
                'filename': chunk.metadata['source'],
                'documentId': document_id
            },
        })
    return vectors
def upload_vectors(index: Index, vectors: List[Dict[str, Union[str, List[float], Dict[str, str]]]], namespace: str):
    """
    Upload a list of vectors to an index.
    Parameters:
    index (Index): The index to upload to.
    vectors (List[Dict[str, Union[str, List[float], Dict[str, str]]]]): The list of vectors to upload.
    namespace (str): The namespace to upload to.
    """
    def chunks(iterable, batch_size=100):
        # Yield successive batches from the iterable
        it = iter(iterable)
        chunk = tuple(itertools.islice(it, batch_size))
        while chunk:
            yield chunk
            chunk = tuple(itertools.islice(it, batch_size))
    for vectors_chunk in chunks(vectors, batch_size=100):
        result = index.upsert(vectors=vectors_chunk, namespace=namespace)
        print(result)
def upload_docs(index_name: str, documents: List[str]) -> Dict[str, Union[str, List[Dict[str, str]]]]:
    """
    Upload documents to an index.
    Parameters:
    index_name (str): The name of the index to upload to.
    documents (List[str]): The documents to upload.
    Returns:
    dict: A dictionary containing the namespace and the names of the uploaded documents.
    """
    document_names = []
    namespace = generate_namespace()
    index = pc.Index(index_name)
    for doc in documents:
        document_id = str(uuid.uuid4())
        chunks = split_document(doc)
        document_names.append({'source': chunks[0].metadata['source'], 'documentId': document_id})
        vectors = generate_vectors(chunks, document_id)
        upload_vectors(index, vectors, namespace)
        print(f"{len(vectors)} vectors uploaded for {doc}")
    return {'namespace': namespace, 'documents': document_names}
docs = [
"../10k_docs/amzn_10k_2020.pdf",
"../10k_docs/amzn_10k_2021.pdf",
"../10k_docs/amzn_10k_2022.pdf",
"../10k_docs/amzn_10k_2023.pdf",
"../10k_docs/amzn_10k_2024.pdf",
"../10k_docs/wmt_10k_2020.pdf",
"../10k_docs/wmt_10k_2021.pdf",
"../10k_docs/wmt_10k_2022.pdf",
"../10k_docs/wmt_10k_2023.pdf",
"../10k_docs/wmt_10k_2024.pdf",
]
upload_docs("<index name>",docs)
Query Pinecone¶
query = "What are the lawsuits that Amazon is facing?"
chunks_extracted = 0
index = pc.Index("<index name>")
concatenatedChunks = ""
embeddings = OpenAIEmbeddings()
query_embedding = embeddings.embed_query(query)
query_response = index.query(
namespace="<namespace name>",
top_k=64,
vector=query_embedding,
include_metadata=True
)
for chunk in query_response.matches:
concatenatedChunks += chunk.metadata['pageContent'] + ' '
chunks_extracted += 1
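Before generating an answer, it can help to eyeball the top matches and their similarity scores:
# Inspect the highest-scoring chunks returned by Pinecone
for match in query_response.matches[:5]:
    print(round(match.score, 3), match.metadata['filename'], 'page', match.metadata['page'])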
Generate response from retrieved chunks¶
answer_prompt = f"""
Context:
You are a helpful chatbot. Your job is to anwer the following question using only the context provided in the Context below:
Question: {query}
Context: {concatenatedChunks}
Answer:"""
response = openai_client.chat.completions.create(
model="gpt-4o",
messages=[{
"role": "user",
"content": answer_prompt
}],
max_tokens=4000
)
print(response.choices[0].message.content)
Graph Retrieval¶
Extract triples using custom prompt¶
A simple script that extracts triples using a custom prompt, instructing the LLM to build precise triples for our use case.
async def process_customprompt(section_text, company_name):
    prompt = f"""
    Using the content below, extract the companies involved and output them in the following JSON format:
    [{{"head": {{"type": "Company", "id": "{company_name}"}}, "relation": "INVOLVED_IN", "tail": {{"type": "Legal Proceeding", "id": <legal proceeding name>}}}}, ...]
    Do not include supporting information. Do not wrap the response in JSON markers. If there is no relevant information, just return an empty array.
    Content:{section_text}
    """
    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": prompt},
        ],
        max_tokens=4000,
        temperature=0.1
    )
    triples = json.loads(response.choices[0].message.content)
    return triples
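Model output occasionally isn't valid JSON, so a small guard keeps the extraction loop from crashing. safe_parse_triples is a hypothetical helper, not part of any SDK; you could call it on the raw completion in place of json.loads above:
def safe_parse_triples(raw: str) -> list:
    """Fall back to an empty list when the model returns invalid JSON."""
    try:
        parsed = json.loads(raw)
        return parsed if isinstance(parsed, list) else []
    except json.JSONDecodeError:
        return []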
Parse 10-K Documents¶
Using llmsherpa, we parse the 10-K documents and extract the most relevant information for our use case. This ensures we're pulling meaningful data about legal proceedings from both Walmart's and Amazon's 10-K filings.
custom_prompt_triples = []
for pdf in sec_10k_docs:
    doc = pdf_reader.read_pdf(pdf["url"])
    for section in doc.sections():
        if "Legal Proceedings" in section.to_text():
            section_text = section.to_text(include_children=True, recurse=True)
            # Get triples from custom prompt
            custom_prompt_out = await process_customprompt(section_text, pdf["company"])
            custom_prompt_triples.extend(custom_prompt_out)
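The same proceeding often recurs across filing years, so it is worth deduplicating the raw triples before formatting them. A sketch over the head/relation/tail shape produced by the custom prompt:
# Keep the first occurrence of each (head, relation, tail) combination
seen = set()
deduped = []
for triple in custom_prompt_triples:
    key = (triple['head']['id'], triple['relation'], triple['tail']['id'])
    if key not in seen:
        seen.add(key)
        deduped.append(triple)
custom_prompt_triples = deduped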
Format triples¶
Transform the triples into the WhyHow format.
def format_custom_triple(triple):
    head = triple['head']
    tail = triple['tail']
    return Triple(
        head=Node(name=head['id'], label=head['type']),
        relation=Relation(name=triple['relation']),
        tail=Node(name=tail['id'], label=tail['type'])
    )

formatted_triples = [format_custom_triple(triple) for triple in custom_prompt_triples]
Build Graph¶
From the structured triples, create a graph using the WhyHow SDK.
# Create workspace
workspace = whyhow_client.workspaces.create(name="<workspace name>")
# Build graph from formatted triples
graph = whyhow_client.graphs.create_graph_from_triples(
workspace_id=workspace.workspace_id,
triples=formatted_triples,
name="<graph name>"
)
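Keep the new IDs around; the queries below reference graph.graph_id:
print(f"Workspace: {workspace.workspace_id}, Graph: {graph.graph_id}")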
Query graph¶
Run a hybrid search using the structured and unstructured query methods, then combine the results to generate a final answer.
Build and run a structured query using the inferred graph schema.
schema_id = whyhow_client.graphs.get(graph_id=graph.graph_id).schema_id
schema = whyhow_client.schemas.get(schema_id)
entities = [entity.name for entity in schema.entities]
relations = [relation.name for relation in schema.relations]
print(entities)
print(relations)
query = "What lawsuits is Amazon dealing with?"
# Using the entities and relations from the schema, we use an LLM to extract relevant entities and relations from the question and build a structured query, helping guarantee completeness
prompt = f"""
Perform entity and relation extraction on the question below using the list of entity types and relation types provided.
The output should an object with three arrays: "entity_types" which are the entity types detected in the question, "relation_types" which are the relation types detected in the question, and "values" which are the relevant entity names detected in the question.
The output should look like this:
{{"entity_types": ["Person", "Place"], "relation_types": ["LIVES_IN], "values": ["John Doe", "New York"]}}
Do not include supporting information. Do not wrap the response in JSON markers. If there is no relevant information, just return an empty array.
Question:{query}
Entities:{entities}
Relations:{relations}
"""
response = openai_client.chat.completions.create(
model="gpt-4o",
messages=[
{"role": "user", "content": prompt},
],
max_tokens=4000,
temperature=0.1
)
structured_query = json.loads(response.choices[0].message.content)
structured_query_response = whyhow_client.graphs.query_structured(
entities=structured_query["entity_types"],
relations=structured_query["relation_types"],
values=structured_query["values"],
graph_id=graph.graph_id)
print(structured_query_response)
Run an unstructured query against the same graph.
unstructured_query_response = whyhow_client.graphs.query_unstructured(query=query, graph_id=graph.graph_id)
print(unstructured_query_response)
Generate a final response by combining the results of the structured and unstructured queries.
prompt = f"""
Using the supporting information from the natural language response and the structured triples below, provide a detailed answer to the question below:
Question: {query}
Natural Language Response: {unstructured_query_response.answer}
Structured Triples: {structured_query_response}
"""
answer = openai_client.chat.completions.create(
model="gpt-4o",
messages=[
{"role": "user", "content": prompt},
],
max_tokens=4000,
temperature=0.1
)
print(answer.choices[0].message.content)