Demystifying LLM Apps with Lance

Contents: Goal · Data prep · Embeddings · Search · Prompting · Conclusions


Here's what's behind the scenes.
from datasets import load_dataset

data = load_dataset('jamescalam/youtube-transcriptions', split='train')

Dataset({
    features: ['title', 'published', 'url', 'video_id', 'channel_id', 'id', 'text', 'start', 'end'],
    num_rows: 208619
})
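
Each row holds one short snippet of a video transcript. A quick peek at the first record (a minimal sketch; the exact values depend on the dataset):

# Inspect a single record to see the fields in practice
data[0]["title"]                   # the source video's title
data[0]["text"]                    # one short snippet of transcript text
data[0]["start"], data[0]["end"]   # start/end timestamps (in seconds)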
import numpy as np
import pandas as pd

window = 20
stride = 4

def contextualize(raw_df, window, stride):
    def process_video(vid):
        # For every video, create the text rolling window
        text = vid.text.values
        time_end = vid["end"].values
        contexts = vid.iloc[:-window:stride, :].copy()
        contexts["text"] = [' '.join(text[start_i:start_i+window])
                            for start_i in range(0, len(vid)-window, stride)]
        contexts["end"] = [time_end[start_i+window-1]
                           for start_i in range(0, len(vid)-window, stride)]
        return contexts
    # concat results from all videos
    return pd.concat([process_video(vid) for _, vid in raw_df.groupby("title")])

df = contextualize(data.to_pandas(), window, stride)

Me finding out that pandas rolling window calls treat text columns as "nuisance" columns
>>> len(df)
48935
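
With window=20 and stride=4, consecutive contexts overlap by 16 snippets; the 208,619 raw rows collapse to roughly num_rows / stride windows, minus the trailing window of each video that can't start a full context, which lands at 48,935.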
import openai

rs = openai.Embedding.create(input="text", engine="text-embedding-ada-002")
emb = rs["data"][0]["embedding"]  # a 1536-dimensional float vector for ada-002
import openai
import ratelimiter
from retry import retry

embed_model = "text-embedding-ada-002"

# API limit at 60/min == 1/sec; 0.9 calls/sec keeps us safely under it
limiter = ratelimiter.RateLimiter(max_calls=0.9, period=1.0)

# Get the embeddings with retry and exponential backoff
@retry(tries=10, delay=1, max_delay=30, backoff=3, jitter=1)
def embed_func(c):
    rs = openai.Embedding.create(input=c, engine=embed_model)
    return [record["embedding"] for record in rs["data"]]

rate_limited = limiter(embed_func)
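
Note the layering: the retry decorator sits inside the rate limiter, so retried calls are throttled too, and the backoff (roughly 1s, 3s, 9s, ..., capped at 30s, plus jitter) keeps transient API errors from killing a long embedding run.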

from tqdm.auto import tqdm
import math

# We request embeddings in batches rather than one at a time
def to_batches(arr, batch_size):
    length = len(arr)
    def _chunker(arr):
        for start_i in range(0, length, batch_size):
            yield arr[start_i:start_i+batch_size]
    # add progress meter
    yield from tqdm(_chunker(arr), total=math.ceil(length / batch_size))

batch_size = 1000
batches = to_batches(df.text.values.tolist(), batch_size)
embeds = [emb for c in batches for emb in rate_limited(c)]

import lance
import pyarrow as pa
from lance.vector import vec_to_table

# Convert the embeddings into an Arrow table and attach them to the dataframe
table = vec_to_table(np.array(embeds))
combined = pa.Table.from_pandas(df).append_column("vector", table["vector"])
ds = lance.write_dataset(combined, "chatbot.lance")
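
As a quick sanity check (a minimal sketch, assuming the write above succeeded), the dataset can be re-opened straight from disk:

# Re-open the Lance dataset from disk and confirm the row count
ds = lance.dataset("chatbot.lance")
ds.count_rows()   # should equal len(df), i.e. 48935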

Nearest-neighbor search is a single to_table call, where q is the query embedding and k the number of results; without an index this is a brute-force scan, which is why we build one next.

ds.to_table(nearest={"column": "vector",
                     "q": query_vector,   # the query embedding
                     "k": 3}).to_pandas()
ds = ds.create_index("vector", index_type="IVF_PQ",
                     num_partitions=64, num_sub_vectors=96)
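
Once the index is built, the same nearest query runs against it automatically. As a sketch, two optional knobs (the values below are illustrative assumptions, not tuned) trade recall for speed:

ds.to_table(nearest={"column": "vector",
                     "q": query_vector,
                     "k": 3,
                     "nprobes": 20,        # IVF partitions to scan; more = better recall
                     "refine_factor": 10   # re-rank extra candidates with exact distances
                     }).to_pandas()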
Me talking about Lance

1. Embed the query text

query = ("Which training method should I exploit for sentence transformers "
"after I only have pairs of related sentences?")
openai.Embedding.create(input=query, engine="text-embedding-ada-002")

2. Search for the most similar contexts

context = ds.to_table(
    nearest={
        "column": "vector",
        "k": 3,
        "q": query_vector
    }).to_pandas()

3. Create a prompt for the OpenAI completion API

        "Answer the query based on the context below.nn"+
"Context:n"
def complete(prompt):
    # query text-davinci-003
    res = openai.Completion.create(
        engine='text-davinci-003',
        prompt=prompt,
        temperature=0,
        max_tokens=400,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )
    return res['choices'][0]['text'].strip()
>>> query = "who was the twelfth person on the moon and when did they land?"
>>> complete(query)
'The twelfth person on the moon was Harrison Schmitt, and he landed on December 11, 1972.'

Putting it all together
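
As a minimal sketch, the whole retrieval-augmented loop fits in one small helper that reuses the pieces defined above (answer itself is our own glue, not a library call):

def answer(question, k=3):
    # 1. Embed the question
    rs = openai.Embedding.create(input=question, engine=embed_model)
    query_vector = rs["data"][0]["embedding"]
    # 2. Retrieve the k most similar transcript windows from Lance
    context = ds.to_table(nearest={"column": "vector",
                                   "q": query_vector,
                                   "k": k}).to_pandas()
    # 3. Build the prompt and ask the completion model
    prompt = ("Answer the question based on the context below.\n\n" +
              "Context:\n" +
              "\n\n---\n\n".join(context["text"]) +
              f"\n\nQuestion: {question}\nAnswer:")
    return complete(prompt)

answer("Which training method should I use for sentence transformers "
       "when I only have pairs of related sentences?")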

(Screenshot: the "ASK DUKE" demo app)
