๋ฐ์ํ
๐ก OpenAI Embedding์ ์ฌ์ฉํ์ฌ ์ฌ์ฉ์ ์ ๋ ฅ๊ณผ ๊ฐ์ฅ ์ ์ฌํ ๋ฌธ์๋ฅผ ๊ฒ์ํ๋ ์๋ฒ๋ฅผ ๊ตฌ์ถํ๋ค.
✔️ 이전 버전: https://doteloper.tistory.com/114
Flow
๊ฐ๋ฐํ๊ฒฝ
- ๋ชจ๋ธ: OpenAI Embedding - text-embedding-3-small
- ๋ฒกํฐ DB: elastic search
- flask / python3
- ์ฐธ๊ณ : openai cookbook
Embedding ์์ฑ
openai package๋ฅผ ์ฌ์ฉํ์ฌ Embedding ๊ฐ ์์ฑ (document)
# OpenAI client; the API key is read from the OPENAI_API_KEY environment
# variable.
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)


def get_embedding(text, model="text-embedding-3-small"):
    """Return the OpenAI embedding vector for ``text``.

    Args:
        text: Input string to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The ``embedding`` attribute of the first item in the response of
        ``client.embeddings.create``.
    """
    # Collapse real line breaks into spaces, as the OpenAI cookbook helper
    # does. The previous pattern "\\n" only matched a literal backslash
    # followed by "n", so actual newlines reached the API untouched.
    text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding
- `text-embedding-3-small`
- `text-embedding-ada-002` 모델보다 성능이 크게 향상된 작고 효율적인 텍스트 임베딩 모델
- 성능 향상: 이 모델은 이전 모델(text-embedding-ada-002)보다 MIRACL(Multi-Language Retrieval) 벤치마크에서 평균 점수가 31.4%에서 44.0%로, MTEB(English Tasks) 벤치마크에서는 61.0%에서 62.3%로 향상
- 가격 인하: text-embedding-3-small 모델의 가격은 이전 모델에 비해 5배 저렴한 $0.00002로 설정
Semantic Search
Create index
Embedding 값 및 정보 (question 및 answer)를 저장할 index 생성
def create_index_es():
    """Create the ``faq-index`` index with the FAQ text and vector fields.

    The mapping holds two text fields (``question`` and ``answer``) and a
    1536-dimensional ``dense_vector`` field named ``content_vector`` that is
    indexed with cosine similarity.
    """
    properties = {}
    # Plain text fields for the FAQ entry itself.
    for field_name in ("question", "answer"):
        properties[field_name] = {"type": "text"}
    # Embedding of the question.
    properties["content_vector"] = {
        "type": "dense_vector",
        "dims": 1536,
        "index": "true",
        "similarity": "cosine",
    }
    es.indices.create(index="faq-index", mappings={"properties": properties})
- faq๋ฅผ ์ ์ฅํ `question` , `answer` ํ๋์ ํด๋น question์ embedding ๊ฐ์ ์ ์ฅํ `content_vector` ํ๋ ์์ฑ
- ์ด๋, embedding ๊ฐ์ ์ ์ฅํ ํ๋๋ `dense_vector` ํ์ ์ผ๋ก ์ง์ ํด์ฃผ์ด์ผ ํ๋ค. ์ด ํ์ ์ kNN search์ ์ฃผ๋ก ์ฌ์ฉ๋๋ค. ๊ทธ๋ ๊ธฐ ๋๋ฌธ์, `aggregations`์ `sorting`์ด ์ง์๋์ง ์๋๋ค.
- `dims` ๋ OpenAI Embedding์ dimension๊ฐ์ธ 1536๋ก ์ง์
- index ๋ง์ฝ kNN search๋ฅผ ์ฌ์ฉํ๋ ค๋ฉด ์ด๋ฅผ true๋ก ์ง์ ํด์ฃผ์ด์ผ ํ๋ค!
- ์ด๋ฅผ true๋ก ์ง์ ํด์ฃผ์๋ค๋ฉด, ์๋ similarity ํ์ ์ ์ค์ ํด์ฃผ์ด์ผ ํ๋ค. ์ด๋ kNN ๊ฒ์์์ ์ฌ์ฉํ ์ ์ฌ์ฑ ์ธก์ ํญ๋ชฉ์ผ๋ก, `l2_norm`, `dot_product`, `cosine` ์ด ์ธ๊ฐ์ง ๊ฐ์ด ์ฌ์ฉ๋ ์ ์๋ค.
- ํด๋น ํ๋ก์ ํธ์์ ์ ๋ณด ๊ฒ์ ์์คํ ์์ ๊ฒ์ ์ฟผ๋ฆฌ์ ๋ฌธ์ ์ฌ์ด์ ์ ์ฌ์ฑ์ ํ๊ฐํ๊ณ , ๊ฐ์ฅ ๊ด๋ จ์ฑ ๋์ ๋ฌธ์๋ฅผ ๊ฒ์ ๊ฒฐ๊ณผ๋ก ๋ฐํํ๋ ๋ฐ ์ฌ์ฉ๋๋ `cosine similarity`์ ์ฌ์ฉํ์๋ค.
Index document
Embedding๊ฐ์ Vector DB์ ์ ์ฅ
# Elasticsearch client pointed at the local node.
es = Elasticsearch("http://localhost:9200")


def index_document(question, answer, embedding):
    """Store one FAQ entry and its embedding in ``faq-index``.

    Args:
        question: FAQ question text.
        answer: FAQ answer text.
        embedding: Embedding vector saved in the ``content_vector`` field.
    """
    # The first parameter used to be misspelled ``qusetion`` while the body
    # read ``question``, so every call failed with NameError.
    es.index(
        index='faq-index',
        body={
            'question': question,
            'answer': answer,
            'content_vector': embedding,
        }
    )
ํด๋น ํจ์๋ฅผ ํตํด index์ document๋ฅผ ์ ์ฅ
Search document
Embedding ๊ฐ์ผ๋ก VectorDB์์์ semantic search ์ํ
# Elasticsearch client pointed at the local node.
es = Elasticsearch("http://localhost:9200")


def search_similarity(user_embedding):
    """Return the answer of the FAQ document most similar to the embedding.

    Runs a ``script_score`` query over all documents in ``faq-index`` that
    scores each one by the cosine similarity between ``user_embedding`` and
    its ``content_vector`` field, and keeps only the top hit.

    Args:
        user_embedding: Query vector passed to the script as
            ``params.query_vector``.

    Returns:
        The ``answer`` field of the best-scoring document, or ``None`` when
        the search returns no hits.
    """
    similar_docs = es.search(
        index='faq-index',
        body={
            "query": {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        # "+ 1.0" shifts the cosine value so the score is
                        # never negative (Elasticsearch rejects negative
                        # script scores).
                        "source": "cosineSimilarity(params.query_vector, 'content_vector') + 1.0",
                        "params": {"query_vector": user_embedding}
                    }
                }
            },
            "_source": ["question", "answer"],
            "size": 1
        }
    )

    hits = similar_docs['hits']['hits']
    if not hits:
        # Nothing indexed yet: indexing hits[0] used to raise IndexError.
        return None

    # Most similar document.
    hit_document = hits[0]

    # Similarity score of that document.
    score = hit_document['_score']
    print(score)

    # Answer of the most similar document.
    return hit_document['_source']['answer']
- `script_score`: ์คํฌ๋ฆฝํธ๋ฅผ ์ฌ์ฉํ์ฌ ์ ์๋ฅผ ๊ณ์ฐํ๋ ์ฟผ๋ฆฌ
- `source`: ์คํฌ๋ฆฝํธ ๋ด์์ `cosineSimilarity` ํจ์๋ฅผ ํธ์ถํ์ฌ ์ ์ฌ์ฑ ๊ณ์ฐ
- ์ด๋ `params.query_vector`๋ ์ฌ์ฉ์๊ฐ ์ ๊ณตํ ๋ฒกํฐ์ด๋ฉฐ, `content_vector`๋ ์ธ๋ฑ์ค ๋ด ๋ฌธ์์ ๋ฒกํฐ ํ๋์ด๋ค.
- `_source` ์ `size` ํ๋์ ์ค์ ์ผ๋ก ๊ฐ์ฅ ์ ์ฌํ ๋ฌธ์ 1๊ฐ์ question๊ณผ answer ํ๋๋ง ๋ฐํ
์ ์ฒด ์ฝ๋
๋๋ณด๊ธฐ
from elasticsearch import Elasticsearch
from dotenv import load_dotenv
from flask import Flask, request, jsonify
import os
from openai import OpenAI
# Flask application object the route handlers in this file are registered on.
app = Flask(__name__)

# Load variables from a .env file before the API key lookup below.
load_dotenv()

# Elasticsearch client pointed at the local node.
es = Elasticsearch("http://localhost:9200")

# OpenAI client; the API key comes from the OPENAI_API_KEY environment
# variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
def get_embedding(text, model="text-embedding-3-small"):
    """Return the OpenAI embedding vector for ``text``.

    Args:
        text: Input string to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The ``embedding`` attribute of the first item in the response of
        ``client.embeddings.create``.
    """
    # Collapse real line breaks into spaces, as the OpenAI cookbook helper
    # does. The previous pattern "\\n" only matched a literal backslash
    # followed by "n", so actual newlines reached the API untouched.
    text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding
def search_similarity(user_embedding):
    """Return the answer of the FAQ document most similar to the embedding.

    Runs a ``script_score`` query over all documents in ``faq-index`` that
    scores each one by the cosine similarity between ``user_embedding`` and
    its ``content_vector`` field, and keeps only the top hit.

    Args:
        user_embedding: Query vector passed to the script as
            ``params.query_vector``.

    Returns:
        The ``answer`` field of the best-scoring document, or ``None`` when
        the search returns no hits.
    """
    similar_docs = es.search(
        index='faq-index',
        body={
            "query": {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        # "+ 1.0" shifts the cosine value so the score is
                        # never negative (Elasticsearch rejects negative
                        # script scores).
                        "source": "cosineSimilarity(params.query_vector, 'content_vector') + 1.0",
                        "params": {"query_vector": user_embedding}
                    }
                }
            },
            "_source": ["question", "answer"],
            "size": 1
        }
    )

    hits = similar_docs['hits']['hits']
    if not hits:
        # Nothing indexed yet: indexing hits[0] used to raise IndexError.
        return None

    # Most similar document.
    hit_document = hits[0]

    # Similarity score of that document.
    score = hit_document['_score']
    print(score)

    # Answer of the most similar document.
    return hit_document['_source']['answer']
def index_document(question, answer, embedding):
    """Store one FAQ entry and its embedding in ``faq-index``.

    Args:
        question: FAQ question text.
        answer: FAQ answer text.
        embedding: Embedding vector saved in the ``content_vector`` field.
    """
    document = {
        'question': question,
        'answer': answer,
        'content_vector': embedding,
    }
    es.index(index='faq-index', body=document)
def chat_with_bot(user_message):
    """Answer ``user_message`` with the closest stored FAQ answer.

    Embeds the message via ``get_embedding`` and returns whatever
    ``search_similarity`` yields for that embedding.
    """
    return search_similarity(get_embedding(user_message))
def embed_and_store_cases(question, answer):
    """Embed ``question`` and index it together with ``answer``.

    Only the question text is embedded; the answer is stored alongside it.
    """
    index_document(question, answer, get_embedding(question))
def create_es_index():
    """Create the ``faq-index`` index with the FAQ text and vector fields.

    The mapping holds two text fields (``question`` and ``answer``) and a
    1536-dimensional ``dense_vector`` field named ``content_vector`` that is
    indexed with cosine similarity.
    """
    properties = {}
    # Plain text fields for the FAQ entry itself.
    for field_name in ("question", "answer"):
        properties[field_name] = {"type": "text"}
    # Embedding of the question.
    properties["content_vector"] = {
        "type": "dense_vector",
        "dims": 1536,
        "index": "true",
        "similarity": "cosine",
    }
    es.indices.create(index="faq-index", mappings={"properties": properties})
@app.route('/answer', methods=['POST'])
def get_answer():
    """Handle ``POST /answer``: return the best FAQ answer for a message.

    Reads ``user_message`` from the JSON request body and responds with a
    JSON object whose ``answer`` key holds the result of ``chat_with_bot``.
    """
    payload = request.json
    # Delegate to the chatbot logic.
    answer = chat_with_bot(payload['user_message'])
    return jsonify({"answer": answer})
@app.route('/store', methods=['POST'])
def store_knowledge():
    """Handle ``POST /store``: embed and index one question/answer pair.

    Reads ``question`` and ``answer`` from the JSON request body, passes
    them to ``embed_and_store_cases`` and responds with ``{"result": "OK"}``.
    """
    payload = request.json
    question, answer = payload['question'], payload['answer']
    embed_and_store_cases(question, answer)
    return jsonify({"result": "OK"})
@app.route('/create', methods=['POST'])
def create_index():
    """Handle ``POST /create``: create the index via ``create_es_index``.

    Responds with ``{"result": "OK"}`` once the call returns.
    """
    create_es_index()
    result = {"result": "OK"}
    return jsonify(result)
# Start Flask's built-in server with its default settings when this file is
# executed as a script.
if __name__ == "__main__":
    app.run()
์ต์ข ๊ฒฐ๊ณผ
๊ฐ์ Tasks
- elastic search search ์ฟผ๋ฆฌ ํ๋ (knn ๋ฑ..)
- score๋ฅผ ํ์ฉํ ์ ์ฌ๋๋ฅผ ์ฌ์ฉ์์๊ฒ ํน์ ๋ถ์์ฉ์ผ๋ก ์ ๊ณต
✔️코드는 아래에서✔️
https://github.com/jeongum/openai-embedding
GitHub - jeongum/openai-embedding: open ai embedding 실습
open ai embedding 실습. Contribute to jeongum/openai-embedding development by creating an account on GitHub.
github.com
๋ฐ์ํ
'๐ป ๊ฐ๋ฐ ์ผ์ง' ์นดํ ๊ณ ๋ฆฌ์ ๋ค๋ฅธ ๊ธ
[kotlin/์ฝํ๋ฆฐ ์ฝ๋ฃจํด์ ์ ์] 5์ฅ. async์ Deferred (0) | 2024.05.21 |
---|---|
OpenAI Embedding์ ์ฌ์ฉํ ์ ์ฌ ๋ฌธ์ ๊ฒ์ (Flask, ElasticSearch) (0) | 2023.10.22 |