Home Artificial Intelligence Demystifying LLM Apps with Lance Goal Data prep Embeddings Search Prompting Conclusions

Demystifying LLM Apps with Lance Goal Data prep Embeddings Search Prompting Conclusions

1
Demystifying LLM Apps with Lance
Goal
Data prep
Embeddings
Search
Prompting
Conclusions

what’s behind the scenes
from datasets import load_dataset
# Load the "train" split of the jamescalam/youtube-transcriptions dataset.
data = load_dataset('jamescalam/youtube-transcriptions', split='train')
Dataset({
features: ['title', 'published', 'url', 'video_id', 'channel_id', 'id', 'text', 'start', 'end'],
num_rows: 208619
})
import numpy as np
import pandas as pd

# Number of consecutive transcript rows merged into one context
# (same value as the one passed to contextualize below).
window = 20
# Step, in rows, between the starts of successive contexts.
stride = 4

def contextualize(raw_df, window, stride):
    """Merge consecutive transcript rows of each video into overlapping contexts.

    Rows are grouped by ``title``. Within each group, every ``stride``-th row
    (excluding the last ``window`` rows) starts a context whose ``text`` is
    the space-joined text of ``window`` consecutive rows and whose ``end`` is
    the ``end`` value of the last row in that window. All other columns keep
    the values of the row that starts the window.

    Args:
        raw_df: DataFrame with at least ``title``, ``text`` and ``end`` columns.
        window: Number of consecutive rows merged into one context (>= 1).
        stride: Step between the starting rows of successive contexts (>= 1).

    Returns:
        A DataFrame with the columns of ``raw_df`` and one row per context.
        Groups with ``window`` rows or fewer contribute no rows. An empty
        frame is returned when ``raw_df`` has no groups at all.

    Raises:
        ValueError: If ``window`` or ``stride`` is smaller than 1.
    """
    if window < 1 or stride < 1:
        raise ValueError("window and stride must be positive integers")

    def process_video(vid):
        # For every video, create the text rolling window
        text = vid["text"].values
        time_end = vid["end"].values
        # Computed once so the text and end columns line up with the rows
        # kept by the strided slice below.
        starts = range(0, len(vid) - window, stride)
        contexts = vid.iloc[:-window:stride, :].copy()
        contexts["text"] = [" ".join(text[i:i + window]) for i in starts]
        contexts["end"] = [time_end[i + window - 1] for i in starts]
        return contexts

    # concat result from all videos
    frames = [process_video(vid) for _, vid in raw_df.groupby("title")]
    if not frames:
        # pd.concat raises on an empty list; hand back an empty frame instead.
        return raw_df.iloc[0:0].copy()
    return pd.concat(frames)

# Reuse the module-level window/stride settings instead of repeating the literals.
df = contextualize(data.to_pandas(), window, stride)

Me checking out pandas rolling windows calls text columns “nuisance”
>>> len(df)
48935
import openai
# Minimal embedding request: one input string, model "text-embedding-ada-002".
openai.Embedding.create(input="text", engine="text-embedding-ada-002")
import functools
import openai
import ratelimiter
from retry import retry

# Model name passed as `engine` to the OpenAI embedding call.
embed_model = "text-embedding-ada-002"

# API limit at 60/min == 1/sec
# max_calls is set just under 1 per 1.0-second period, presumably to stay
# below that limit. TODO confirm how ratelimiter treats a fractional max_calls.
limiter = ratelimiter.RateLimiter(max_calls=0.9, period=1.0)

# Get the embedding with retry
@retry(tries=10, delay=1, max_delay=30, backoff=3, jitter=1)
def embed_func(c):
    """Return the embedding vectors for a batch of input texts.

    The call is wrapped by the ``retry`` decorator with the arguments shown
    above, so a failing request is presumably re-attempted up to 10 times.

    Args:
        c: Value passed as ``input`` to ``openai.Embedding.create``. The
            callers in this file pass a list of strings.

    Returns:
        A list holding the ``"embedding"`` entry of each record in the
        response's ``"data"`` field.
    """
    rs = openai.Embedding.create(input=c, engine=embed_model)
    return [record["embedding"] for record in rs["data"]]

# Wrap embed_func with the limiter. Presumably this throttles calls to the
# configured rate; confirm against the ratelimiter documentation.
rate_limited = limiter(embed_func)

from tqdm.auto import tqdm
import math

# We request in batches rather than 1 embedding at a time
def to_batches(arr, batch_size):
    """Yield successive slices of ``arr``, wrapped in a tqdm progress meter.

    Args:
        arr: Sized, sliceable sequence (for example a list of texts).
        batch_size: Maximum number of items per batch.

    Yields:
        ``arr[i:i + batch_size]`` for ``i`` stepping through ``arr`` in
        increments of ``batch_size``. The last batch may be shorter.
    """
    length = len(arr)

    def _chunker(arr):
        # Step over the sequence being chunked, not the module-level ``df``,
        # so the function works for any input.
        for start_i in range(0, length, batch_size):
            yield arr[start_i:start_i + batch_size]

    # add progress meter
    yield from tqdm(_chunker(arr), total=math.ceil(length / batch_size))

# Number of texts sent per embedding request.
batch_size = 1000
batches = to_batches(df.text.values.tolist(), batch_size)
# Embed each batch through the rate-limited function and flatten the
# per-batch results into a single list of embeddings.
embeds = [emb for c in batches for emb in rate_limited(c)]

import lance
import pyarrow as pa
from lance.vector import vec_to_table

# Convert the collected embeddings with lance's vec_to_table; the result's
# "vector" column is read on the next line.
table = vec_to_table(np.array(embeds))
# Attach the embeddings to the context rows as a "vector" column.
combined = pa.Table.from_pandas(df).append_column("vector", table["vector"])
# Write the combined table to the "chatbot.lance" dataset.
ds = lance.write_dataset(combined, "chatbot.lance")

ds.to_table(nearest={"column": "vector",
                     "q": [<query vector>],
                     "k": <number of neighbours>}).to_pandas()
# Build an IVF_PQ index on the "vector" column (64 partitions, 96 sub-vectors).
ds = ds.create_index("vector", index_type="IVF_PQ", 
num_partitions=64, num_sub_vectors=96)
Me talking about Lance

1. Embed the query text

query = ("Which training method should I exploit for sentence transformers "
         "after I only have pairs of related sentences?")
# Keep the embedding instead of discarding the response: the nearest-neighbour
# search that follows reads `query_vector`.
response = openai.Embedding.create(input=query, engine="text-embedding-ada-002")
query_vector = response["data"][0]["embedding"]

2. Search for the most similar context

# Fetch the 3 nearest rows to `query_vector` on the "vector" column and
# convert the result to a pandas DataFrame.
context = ds.to_table(
nearest={
"column": "vector",
"k": 3,
"q": query_vector
}).to_pandas()

3. Create a prompt for the OpenAI completion API

        "Answer the query based on the context below.\n\n"+
        "Context:\n"
def complete(prompt):
    """Send ``prompt`` to the OpenAI completion API and return the answer text.

    Args:
        prompt: Prompt string forwarded unchanged as ``prompt``.

    Returns:
        The ``text`` of the first choice in the response, stripped of
        leading and trailing whitespace.
    """
    # query text-davinci-003
    res = openai.Completion.create(
        engine='text-davinci-003',
        prompt=prompt,
        temperature=0,
        max_tokens=400,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    return res['choices'][0]['text'].strip()
>> query = "who was the twelfth person on the moon and when did they land?"
>> complete(query)
'The twelfth person on the moon was Harrison Schmitt, and he landed on December 11, 1972.'

Putting it all together

1 COMMENT

LEAVE A REPLY

Please enter your comment!
Please enter your name here