Tags: openai-api, langchain, azure-openai, openai, embeddings, retrieval-augmented-generation

TypeError: expected string or buffer - Langchain, OpenAI Embeddings


I am trying to build a RAG setup using product manuals in PDF, which are split, indexed and stored in a Chroma vector store persisted on disk. When I call the function that classifies reviews using the retrieved documents as context, I get the error shown in the stack trace below the code:


from langchain import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI
from langchain.vectorstores import Chroma

llm = AzureChatOpenAI(
        azure_deployment="ChatGPT-16K",
        openai_api_version="2023-05-15",
        azure_endpoint=endpoint,
        api_key=result["access_token"],
        temperature=0,
        seed = 100
    )

embedding_model = AzureOpenAIEmbeddings(
    api_version="2023-05-15",
    azure_endpoint=endpoint,
    api_key=result["access_token"],
    azure_deployment="ada002",
)

vectordb = Chroma(
    persist_directory=vector_db_path,
    embedding_function=embedding_model,
    collection_name="product_manuals",
)


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

def classify(review_title, review_text, product_num):

    template = """
        
    You are a customer service AI Assistant that handles responses to negative product reviews. 

       Use the context below to categorize {review_title} and {review_text} into defect, misuse or poor quality categories, based only on the provided context. If you don't know, say that you do not know; don't try to make up an answer. Respond with an answer in the following format:

        poor quality
        misuse
        defect

        {context}
            
    Category: 
    """


    rag_prompt = PromptTemplate.from_template(template)
    
    retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={'filter': {'product_num': product_num}})


    retrieval_chain = (
            {"context": retriever | format_docs, "review_title": RunnablePassthrough(), "review_text": RunnablePassthrough()}
            | rag_prompt
            | llm
            | StrOutputParser()
    )
    return retrieval_chain.invoke({"review_title": review_title, "review_text": review_text})

classify(review_title="Terrible", review_text="This baking sheet is terrible. It stains so easily and i've tried everything to get it clean", product_num="8888999")

Error stack:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File <command-3066972537097411>, line 1
----> 1 issue_recommendation(
      2     review_title="Terrible",
      3     review_text="This baking sheet is terrible. It stains so easily and i've tried everything to get it clean. I've maybe used it 5 times and it looks like it's 20 years old. The side of the pan also hold water, so when you pick it up off the drying rack, water runs out. I would never purchase these again.",
      4     product_num="8888999"
      5    
      6 )

File <command-3066972537097410>, line 44, in issue_recommendation(review_title, review_text, product_num)
     36 retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={'filter': {'product_num': product_num}})
     38 retrieval_chain = (
     39         {"context": retriever | format_docs, "review_text": RunnablePassthrough()}
     40         | rag_prompt
     41         | llm
     42         | StrOutputParser()
     43 )
---> 44 return retrieval_chain.invoke({"review_title":review_title, "review_text": review_text})

File /local_disk0/.ephemeral_nfs/envs/pythonEnv-65a09d8c-062d-4f4f-9c52-1bf534f6511e/lib/python3.10/site-packages/langchain_core/runnables/base.py:1762, in RunnableSequence.invoke(self, input, config)
   1760 try:
   1761     for i, step in enumerate(self.steps):
-> 1762         input = step.invoke(
   1763             input,
   1764             # mark each step as a child run
   1765             patch_config(
   1766                 config, callbacks=run_manager.get_child(f"seq:step:{i+1}")
   1767             ),
   1768         )
   1769 # finish the root run
   1770 except BaseException as e:

File /local_disk0/.ephemeral_nfs/envs/pythonEnv-65a09d8c-062d-4f4f-9c52-1bf534f6511e/lib/python3.10/site-packages/langchain_core/runnables/base.py:2327, in RunnableParallel.invoke(self, input, config)
   2314     with get_executor_for_config(config) as executor:
   2315         futures = [
   2316             executor.submit(
   2317                 step.invoke,
   (...)
   2325             for key, step in steps.items()
   2326         ]
-> 2327         output = {key: future.result() for key, future in zip(steps, futures)}
   2328 # finish the root run
   2329 except BaseException as e:

File /local_disk0/.ephemeral_nfs/envs/pythonEnv-65a09d8c-062d-4f4f-9c52-1bf534f6511e/lib/python3.10/site-packages/langchain_core/runnables/base.py:2327, in <dictcomp>(.0)
   2314     with get_executor_for_config(config) as executor:
   2315         futures = [
   2316             executor.submit(
   2317                 step.invoke,
   (...)
   2325             for key, step in steps.items()
   2326         ]
-> 2327         output = {key: future.result() for key, future in zip(steps, futures)}
   2328 # finish the root run
   2329 except BaseException as e:

File /usr/lib/python3.10/concurrent/futures/_base.py:451, in Future.result(self, timeout)
    449     raise CancelledError()
    450 elif self._state == FINISHED:
--> 451     return self.__get_result()
    453 self._condition.wait(timeout)
    455 if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:

File /usr/lib/python3.10/concurrent/futures/_base.py:403, in Future.__get_result(self)
    401 if self._exception:
    402     try:
--> 403         raise self._exception
    404     finally:
    405         # Break a reference cycle with the exception in self._exception
    406         self = None

File /usr/lib/python3.10/concurrent/futures/thread.py:58, in _WorkItem.run(self)
     55     return
     57 try:
---> 58     result = self.fn(*self.args, **self.kwargs)
     59 except BaseException as exc:
     60     self.future.set_exception(exc)

File /local_disk0/.ephemeral_nfs/envs/pythonEnv-65a09d8c-062d-4f4f-9c52-1bf534f6511e/lib/python3.10/site-packages/langchain_core/runnables/base.py:1762, in RunnableSequence.invoke(self, input, config)
   1760 try:
   1761     for i, step in enumerate(self.steps):
-> 1762         input = step.invoke(
   1763             input,
   1764             # mark each step as a child run
   1765             patch_config(
   1766                 config, callbacks=run_manager.get_child(f"seq:step:{i+1}")
   1767             ),
   1768         )
   1769 # finish the root run
   1770 except BaseException as e:

File /local_disk0/.ephemeral_nfs/envs/pythonEnv-65a09d8c-062d-4f4f-9c52-1bf534f6511e/lib/python3.10/site-packages/langchain_core/retrievers.py:121, in BaseRetriever.invoke(self, input, config)
    117 def invoke(
    118     self, input: str, config: Optional[RunnableConfig] = None
    119 ) -> List[Document]:
    120     config = ensure_config(config)
--> 121     return self.get_relevant_documents(
    122         input,
    123         callbacks=config.get("callbacks"),
    124         tags=config.get("tags"),
    125         metadata=config.get("metadata"),
    126         run_name=config.get("run_name"),
    127     )

File /local_disk0/.ephemeral_nfs/envs/pythonEnv-65a09d8c-062d-4f4f-9c52-1bf534f6511e/lib/python3.10/site-packages/langchain_core/retrievers.py:223, in BaseRetriever.get_relevant_documents(self, query, callbacks, tags, metadata, run_name, **kwargs)
    221 except Exception as e:
    222     run_manager.on_retriever_error(e)
--> 223     raise e
    224 else:
    225     run_manager.on_retriever_end(
    226         result,
    227         **kwargs,
    228     )

File /local_disk0/.ephemeral_nfs/envs/pythonEnv-65a09d8c-062d-4f4f-9c52-1bf534f6511e/lib/python3.10/site-packages/langchain_core/retrievers.py:216, in BaseRetriever.get_relevant_documents(self, query, callbacks, tags, metadata, run_name, **kwargs)
    214 _kwargs = kwargs if self._expects_other_args else {}
    215 if self._new_arg_supported:
--> 216     result = self._get_relevant_documents(
    217         query, run_manager=run_manager, **_kwargs
    218     )
    219 else:
    220     result = self._get_relevant_documents(query, **_kwargs)

File /local_disk0/.ephemeral_nfs/envs/pythonEnv-65a09d8c-062d-4f4f-9c52-1bf534f6511e/lib/python3.10/site-packages/langchain_core/vectorstores.py:654, in VectorStoreRetriever._get_relevant_documents(self, query, run_manager)
    650 def _get_relevant_documents(
    651     self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    652 ) -> List[Document]:
    653     if self.search_type == "similarity":
--> 654         docs = self.vectorstore.similarity_search(query, **self.search_kwargs)
    655     elif self.search_type == "similarity_score_threshold":
    656         docs_and_similarities = (
    657             self.vectorstore.similarity_search_with_relevance_scores(
    658                 query, **self.search_kwargs
    659             )
    660         )

File /local_disk0/.ephemeral_nfs/envs/pythonEnv-65a09d8c-062d-4f4f-9c52-1bf534f6511e/lib/python3.10/site-packages/langchain_community/vectorstores/chroma.py:348, in Chroma.similarity_search(self, query, k, filter, **kwargs)
    331 def similarity_search(
    332     self,
    333     query: str,
   (...)
    336     **kwargs: Any,
    337 ) -> List[Document]:
    338     """Run similarity search with Chroma.
    339 
    340     Args:
   (...)
    346         List[Document]: List of documents most similar to the query text.
    347     """
--> 348     docs_and_scores = self.similarity_search_with_score(
    349         query, k, filter=filter, **kwargs
    350     )
    351     return [doc for doc, _ in docs_and_scores]

File /local_disk0/.ephemeral_nfs/envs/pythonEnv-65a09d8c-062d-4f4f-9c52-1bf534f6511e/lib/python3.10/site-packages/langchain_community/vectorstores/chroma.py:437, in Chroma.similarity_search_with_score(self, query, k, filter, where_document, **kwargs)
    429     results = self.__query_collection(
    430         query_texts=[query],
    431         n_results=k,
   (...)
    434         **kwargs,
    435     )
    436 else:
--> 437     query_embedding = self._embedding_function.embed_query(query)
    438     results = self.__query_collection(
    439         query_embeddings=[query_embedding],
    440         n_results=k,
   (...)
    443         **kwargs,
    444     )
    446 return _results_to_docs_and_scores(results)

File /local_disk0/.ephemeral_nfs/envs/pythonEnv-65a09d8c-062d-4f4f-9c52-1bf534f6511e/lib/python3.10/site-packages/langchain_community/embeddings/openai.py:691, in OpenAIEmbeddings.embed_query(self, text)
    682 def embed_query(self, text: str) -> List[float]:
    683     """Call out to OpenAI's embedding endpoint for embedding query text.
    684 
    685     Args:
   (...)
    689         Embedding for the text.
    690     """
--> 691     return self.embed_documents([text])[0]

File /local_disk0/.ephemeral_nfs/envs/pythonEnv-65a09d8c-062d-4f4f-9c52-1bf534f6511e/lib/python3.10/site-packages/langchain_community/embeddings/openai.py:662, in OpenAIEmbeddings.embed_documents(self, texts, chunk_size)
    659 # NOTE: to keep things simple, we assume the list may contain texts longer
    660 #       than the maximum context and use length-safe embedding function.
    661 engine = cast(str, self.deployment)
--> 662 return self._get_len_safe_embeddings(texts, engine=engine)

File /local_disk0/.ephemeral_nfs/envs/pythonEnv-65a09d8c-062d-4f4f-9c52-1bf534f6511e/lib/python3.10/site-packages/langchain_community/embeddings/openai.py:465, in OpenAIEmbeddings._get_len_safe_embeddings(self, texts, engine, chunk_size)
    459 if self.model.endswith("001"):
    460     # See: https://github.com/openai/openai-python/
    461     #      issues/418#issuecomment-1525939500
    462     # replace newlines, which can negatively affect performance.
    463     text = text.replace("\n", " ")
--> 465 token = encoding.encode(
    466     text=text,
    467     allowed_special=self.allowed_special,
    468     disallowed_special=self.disallowed_special,
    469 )
    471 # Split tokens into chunks respecting the embedding_ctx_length
    472 for j in range(0, len(token), self.embedding_ctx_length):

File /databricks/python/lib/python3.10/site-packages/tiktoken/core.py:116, in Encoding.encode(self, text, allowed_special, disallowed_special)
    114     if not isinstance(disallowed_special, frozenset):
    115         disallowed_special = frozenset(disallowed_special)
--> 116     if match := _special_token_regex(disallowed_special).search(text):
    117         raise_disallowed_special_token(match.group())
    119 try:

TypeError: expected string or buffer

Embeddings seem to work fine when I test them on their own. The chain also works when I remove the context and retriever, so the problem seems to be related to the embeddings. The examples on the LangChain website instantiate the retriever from Chroma.from_documents(), whereas I load the Chroma vector store from a persisted path. I also tried invoking with review_text only (instead of both review title and review text), but the error persists. I'm not sure why this is happening. These are the package versions I'm working with:

Name: openai Version: 1.6.1

Name: langchain Version: 0.0.354
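
For reference, this is roughly how I checked that the embedding model and the retriever work when called directly with a plain string (a minimal sketch using the objects defined above; the test query text is made up):

test_query = "baking sheet stains easily"   # made-up query text

# Embedding a plain string works fine
vector = embedding_model.embed_query(test_query)
print(len(vector))

# The retriever also works fine when invoked with a plain string
retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"filter": {"product_num": "8888999"}},
)
docs = retriever.invoke(test_query)
print(len(docs))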


Solution

  • I've come across the same issue, and it turned out that LangChain passes a key-value pair (the whole input dict) to encoding.encode(), while it requires a str. A workaround is to use itemgetter() to pull the plain string out of the input. It might look something like this:

            from operator import itemgetter

            retrieval_chain = (
                {
                    # itemgetter("question") pulls the raw string out of the input dict,
                    # so the retriever (and the embedding call behind it) receives a str
                    "document": itemgetter("question") | self.retriever,
                    "question": itemgetter("question"),
                }
                | prompt
                | model
                | StrOutputParser()
            )
    

    You can find the reference here.
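
    Applying the same idea to the chain in the question would look roughly like this (a sketch, not a definitive fix: it reuses retriever, rag_prompt, llm and format_docs from the question, and assumes the review text is what should be used as the retrieval query):

        from operator import itemgetter

        retrieval_chain = (
            {
                # send only plain strings into each slot; the retriever gets the
                # review text as its query instead of the whole input dict
                "context": itemgetter("review_text") | retriever | format_docs,
                "review_title": itemgetter("review_title"),
                "review_text": itemgetter("review_text"),
            }
            | rag_prompt
            | llm
            | StrOutputParser()
        )
        return retrieval_chain.invoke({"review_title": review_title, "review_text": review_text})

    In the original chain the "context" branch pipes the whole input dict straight into the retriever, which forwards it to embed_query() and ultimately encoding.encode(), hence the TypeError; itemgetter() makes sure only a string ever reaches the retriever.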