my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

import json
import os
import time
from collections import defaultdict

import numpy as np
import tiktoken
from dotenv import load_dotenv
from jinja2 import Template
from openai import OpenAI
from tqdm import tqdm

load_dotenv()
#14
#15	PROMPT = """
#16	# Question:
#17	{{QUESTION}}
#18
#19	# Context:
#20	{{CONTEXT}}
#21
#22	# Short answer:
#23	"""
#24
#25
class RAGManager:
    """Answer questions about stored conversations with a simple RAG pipeline.

    Each conversation is flattened to text, split into token-based chunks and
    embedded; the chunks most similar to a question are then passed as context
    to a chat model, and the answers are written to a JSON file.
    """

    # System message sent with every answer request. The sentences are
    # separated by explicit spaces so they do not run together when the
    # adjacent string literals are concatenated.
    SYSTEM_PROMPT = (
        "You are a helpful assistant that can answer "
        "questions based on the provided context. "
        "If the question involves timing, use the conversation date for reference. "
        "Provide the shortest possible answer. "
        "Use words directly from the conversation when possible. "
        "Avoid using subjects in your answer."
    )

    # Separator placed between retrieved chunks in the combined context.
    CHUNK_SEPARATOR = "\n<->\n"

    def __init__(self, data_path="dataset/locomo10_rag.json", chunk_size=500, k=1):
        """Store the run configuration and create the OpenAI client.

        Args:
            data_path: Path of the JSON dataset read by
                ``process_all_conversations``.
            chunk_size: Chunk length in tokens; ``-1`` means "use the whole
                conversation as a single context, without retrieval".
            k: Number of chunks retrieved per question.
        """
        self.model = os.getenv("MODEL")
        self.client = OpenAI()
        self.data_path = data_path
        self.chunk_size = chunk_size
        self.k = k

    def generate_response(self, question, context, max_retries=3, retry_delay=1):
        """Ask the chat model to answer ``question`` using ``context``.

        Args:
            question: The question text.
            context: Retrieved conversation text inserted into the prompt.
            max_retries: Number of additional attempts after the first
                failed request.
            retry_delay: Seconds to wait between attempts.

        Returns:
            A ``(answer, response_time)`` tuple: the stripped model output and
            the duration in seconds of the successful request.

        Raises:
            Exception: Whatever the last failed request raised, once all
                attempts are exhausted.
        """
        prompt = Template(PROMPT).render(CONTEXT=context, QUESTION=question)
        messages = [
            {"role": "system", "content": self.SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ]

        for attempt in range(max_retries + 1):
            try:
                # perf_counter is monotonic, so the measured duration cannot
                # be distorted by wall-clock adjustments.
                started = time.perf_counter()
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    temperature=0,
                )
                elapsed = time.perf_counter() - started
            except Exception:
                if attempt >= max_retries:
                    # Bare raise keeps the original traceback intact.
                    raise
                time.sleep(retry_delay)  # Wait before retrying
            else:
                # The API may return no text content; treat that as empty.
                content = response.choices[0].message.content or ""
                return content.strip(), elapsed

    def clean_chat_history(self, chat_history):
        """Flatten chat turns into one ``timestamp | speaker: text`` line each.

        Args:
            chat_history: Iterable of mappings with ``timestamp``, ``speaker``
                and ``text`` keys.

        Returns:
            A single string with one newline-terminated line per turn
            (empty when there are no turns).
        """
        return "".join(
            f"{turn['timestamp']} | {turn['speaker']}: {turn['text']}\n"
            for turn in chat_history
        )

    def calculate_embedding(self, document):
        """Return the embedding vector for ``document``.

        The embedding model name is read from the ``EMBEDDING_MODEL``
        environment variable.
        """
        response = self.client.embeddings.create(
            model=os.getenv("EMBEDDING_MODEL"), input=document
        )
        return response.data[0].embedding

    def calculate_similarity(self, embedding1, embedding2):
        """Return the cosine similarity of two vectors as a float.

        A zero-length vector has no direction, so the similarity is reported
        as ``0.0`` instead of dividing by zero.
        """
        vec1 = np.asarray(embedding1, dtype=float)
        vec2 = np.asarray(embedding2, dtype=float)
        denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if denominator == 0:
            return 0.0
        return float(np.dot(vec1, vec2) / denominator)

    def search(self, query, chunks, embeddings, k=1):
        """Search for the top-k most similar chunks to the query.

        Args:
            query: The query string.
            chunks: List of text chunks.
            embeddings: List of embeddings, one per chunk.
            k: Number of top chunks to return (default: 1). Values below 1
                are treated as 1 and values above the number of chunks are
                capped.

        Returns:
            combined_chunks: The top-k chunks, most similar first, joined
                with ``CHUNK_SEPARATOR`` (empty string when there is nothing
                to search).
            search_time: Time taken for the search, in seconds.
        """
        started = time.perf_counter()

        # Nothing to rank: skip the embedding request entirely.
        if len(chunks) == 0 or len(embeddings) == 0:
            return "", time.perf_counter() - started

        query_embedding = self.calculate_embedding(query)
        similarities = np.array(
            [self.calculate_similarity(query_embedding, emb) for emb in embeddings]
        )

        top_k = max(1, min(int(k), len(similarities)))
        # Stable sort on the negated scores: highest similarity first, and
        # ties resolved in favour of the earliest chunk (as argmax would).
        top_indices = np.argsort(-similarities, kind="stable")[:top_k]

        combined_chunks = self.CHUNK_SEPARATOR.join(chunks[int(i)] for i in top_indices)
        return combined_chunks, time.perf_counter() - started

    def create_chunks(self, chat_history, chunk_size=500):
        """Split a conversation into token-based chunks and embed each one.

        Token counting uses the tiktoken encoding of the model named by the
        ``EMBEDDING_MODEL`` environment variable.

        Args:
            chat_history: Chat turns accepted by ``clean_chat_history``.
            chunk_size: Chunk length in tokens; ``-1`` returns the whole
                conversation as one chunk with no embeddings.

        Returns:
            A ``(chunks, embeddings)`` tuple of parallel lists.

        Raises:
            ValueError: If ``chunk_size`` is neither ``-1`` nor positive.
        """
        documents = self.clean_chat_history(chat_history)

        # Full-context mode needs neither a tokenizer nor embeddings.
        if chunk_size == -1:
            return [documents], []
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be -1 or a positive integer, got {chunk_size}")

        encoding = tiktoken.encoding_for_model(os.getenv("EMBEDDING_MODEL"))
        tokens = encoding.encode(documents)

        # Split into chunks based on token count
        chunks = [
            encoding.decode(tokens[start : start + chunk_size])
            for start in range(0, len(tokens), chunk_size)
        ]
        embeddings = [self.calculate_embedding(chunk) for chunk in chunks]

        return chunks, embeddings

    def _save_results(self, results, output_file_path):
        """Write the results collected so far to ``output_file_path`` as JSON."""
        with open(output_file_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=4)

    def process_all_conversations(self, output_file_path):
        """Answer every question in the dataset and save the results as JSON.

        The dataset at ``self.data_path`` maps a conversation key to an
        object with a ``conversation`` (chat turns) and a ``question`` list.
        Results are grouped by conversation key.

        Args:
            output_file_path: Destination JSON file; it is rewritten after
                every answered question so partial progress survives a crash.
        """
        with open(self.data_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        results = defaultdict(list)
        for key, value in tqdm(data.items(), desc="Processing conversations"):
            chat_history = value["conversation"]
            questions = value["question"]

            chunks, embeddings = self.create_chunks(chat_history, self.chunk_size)

            for item in tqdm(questions, desc="Answering questions", leave=False):
                question = item["question"]
                answer = item.get("answer", "")
                category = item["category"]

                if self.chunk_size == -1:
                    # Full-context mode: the single chunk is the whole conversation.
                    context = chunks[0]
                    search_time = 0
                else:
                    context, search_time = self.search(question, chunks, embeddings, k=self.k)
                response, response_time = self.generate_response(question, context)

                results[key].append(
                    {
                        "question": question,
                        "answer": answer,
                        "category": category,
                        "context": context,
                        "response": response,
                        "search_time": search_time,
                        "response_time": response_time,
                    }
                )
                # Checkpoint after each answer.
                self._save_results(results, output_file_path)

        # Save results (also covers a dataset with no questions at all)
        self._save_results(results, output_file_path)
#184

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public