I have a sample meeting transcript in a txt file and I want to generate meeting notes from it.
I am using LangChain's summarization chain to do this, with the BLOOM model as the open-source LLM for the task.
This is the code:
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
checkpoint = "bigscience/bloom-560m"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)
transcript_file = "/content/transcript/transcript.txt"
with open(transcript_file, encoding='latin-1') as file:
    documents = file.read()
text_splitter = CharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=200,
    length_function=len
)
texts = text_splitter.split_text(documents)
docs = [Document(page_content=t) for t in texts]
target_len = 500
prompt_template = """Act as a professional technical meeting minutes writer.
Tone: formal
Format: Technical meeting summary
Tasks:
- Highlight action items and owners
- Highlight the agreements
- Use bullet points if needed
{text}
CONCISE SUMMARY IN ENGLISH:"""
PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
refine_template = (
    "Your job is to produce a final summary.\n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
    f"Given the new context, refine the original summary in English within {target_len} words, "
    "following this format:\n"
    "Participants: <participants>\n"
    "Discussed: <Discussed-items>\n"
    "Follow-up actions: <a-list-of-follow-up-actions-with-owner-names>\n"
    "If the context isn't useful, return the original summary. "
    "Highlight agreements and follow-up actions and owners."
)
refine_prompt = PromptTemplate(
    input_variables=["existing_answer", "text"],
    template=refine_template,
)
chain = load_summarize_chain(
    model=model,
    chain_type="refine",
    return_intermediate_steps=True,
    question_prompt=PROMPT,
    refine_prompt=refine_prompt
)
result = chain({"input_documents": docs}, return_only_outputs=True)
I get this error:
ValidationError: 1 validation error for LLMChain
llm
value is not a valid dict (type=type_error.dict)
I do not understand where I am going wrong. Please advise.
Right, I think the clue is in here - https://integrations.langchain.com/llms. As you can see, BLOOM is not on the supported list, so I think you have to go through the HuggingFacePipeline wrapper.
This is the code I am trying for GPT-2:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain

model_name = "gpt2-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Load the Hugging Face pipeline, as GPT-2 is not directly supported by LangChain
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=256, temperature=0.7, top_p=0.95, repetition_penalty=1.15)
local_llm = HuggingFacePipeline(pipeline=pipe)  # note: pass the pipeline object (pipe), not the pipeline function
# ConversationalRetrievalChain is for keeping memory/history
chain = ConversationalRetrievalChain.from_llm(llm=local_llm, retriever=vectorstore.as_retriever(search_kwargs={"k": 1}), chain_type="stuff")
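For completeness, calling that chain would then look something like this (the question string is just a placeholder, and chat_history starts out empty):

# Each turn passes the question plus the running chat history;
# the answer comes back under the "answer" key
chat_history = []
result = chain({"question": "What were the follow-up actions?", "chat_history": chat_history})
print(result["answer"])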
model_name="gpt2-medium" and I've omitted all the vector database set up as I don't think it is relevant to your example
I hope this helps, but apologies if not, as I'm a bit of a newbie to this myself.