"""Embed sample documents with Azure OpenAI and vector-search them in MongoDB.

The script creates a ``cosmosSearch`` vector index on a collection, loads
``./sample_data.json``, stores each item together with its embedding, and
then runs a nearest-neighbour query for a fixed search phrase.
"""
import asyncio
import json
import os

import pymongo
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from openai import AzureOpenAI

# SECURITY(review): this connection string embeds a username and password in
# source. Rotate the credential and supply the string through the
# MONGO_CONNECTION_STRING environment variable instead; the literal is kept
# only as a backward-compatible fallback.
mongo_conn = os.environ.get(
    "MONGO_CONNECTION_STRING",
    "mongodb+srv://GhostXAdmin:Datascience123@ghostxcosmoscluster.mongocluster.cosmos.azure.com/?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000",
)

# Token scope requested for Azure OpenAI (Cognitive Services) access.
_COGNITIVE_SERVICES_SCOPE = "https://cognitiveservices.azure.com/.default"


def get_azure_openai_token():
    """Return a bearer token string for the Cognitive Services scope.

    The token is obtained through ``DefaultAzureCredential``.

    Returns:
        The raw access-token string.
    """
    credential = DefaultAzureCredential()
    token = credential.get_token(_COGNITIVE_SERVICES_SCOPE)
    return token.token


# Callable handed to the OpenAI client so it can fetch tokens on demand.
token_provider = get_bearer_token_provider(
    DefaultAzureCredential(), _COGNITIVE_SERVICES_SCOPE
)

# Azure OpenAI Configuration
# NOTE(review): AZURE_OPENAI_API_KEY is fetched once at import time and is not
# passed to the client below (which authenticates via ``token_provider``). It
# is kept so anything importing this name keeps working.
AZURE_OPENAI_API_KEY = get_azure_openai_token()
AZURE_OPENAI_ENDPOINT = "https://aoai-glad.openai.azure.com/"  # Replace with your Azure endpoint
AZURE_DEPLOYMENT_NAME = "gpt-4o-SK"  # Replace with your Azure deployment name
AZURE_API_VERSION = "2024-05-01-preview"  # Use the correct API version for your setup

# Embedding model used for both stored documents and queries, and the vector
# size declared for the index in ``main``.
EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_DIMENSIONS = 1536

# Document field that holds the embedding; the index and the search query
# must both refer to exactly this (case-sensitive) name.
VECTOR_FIELD = "vector"

client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_ad_token_provider=token_provider,
    api_version=AZURE_API_VERSION,
)


def generate_embeddings(text, model=EMBEDDING_MODEL):
    """Generate an embedding vector for a string of text.

    Used to vectorize both the stored data and the user's query.

    Args:
        text: The text to embed.
        model: Embedding model/deployment name passed to the embeddings API.

    Returns:
        The embedding of the first (only) input, as returned by the API.
    """
    response = client.embeddings.create(input=text, model=model)
    return response.data[0].embedding


DATABASE_NAME = "TestDB"
COLLECTION_NAME = "newCol"

mongo_client = pymongo.MongoClient(mongo_conn)
db = mongo_client[DATABASE_NAME]
collection = db[COLLECTION_NAME]


def insertdata(path="./sample_data.json"):
    """Load items from a JSON file, embed their content, and insert them.

    Each item in the file is read through its ``id``, ``title`` and
    ``content`` keys; the stored document adds the embedding of ``content``
    under ``VECTOR_FIELD``.

    Args:
        path: Location of the JSON file to load.
    """
    # Context manager closes the file even if parsing fails.
    with open(path, mode="r", encoding="utf-8") as data_file:
        data = json.load(data_file)

    documents = [
        {
            "id": item["id"],
            "title": item["title"],
            "content": item["content"],
            VECTOR_FIELD: generate_embeddings(item["content"]),
        }
        for item in data
    ]

    # insert_many rejects an empty list, so an empty file is a no-op.
    if not documents:
        return

    collection.insert_many(documents)


async def main():
    """Create the vector index, load the sample data, and run a test query.

    Declared ``async`` so the existing ``asyncio.run(main())`` entry point
    keeps working; the calls inside are ordinary blocking calls.
    """
    # NOTE(review): numLists is normally sized relative to the number of
    # documents; 800 may be far too high for a small sample file — confirm.
    collection.create_index(
        [(VECTOR_FIELD, "cosmosSearch")],
        cosmosSearchOptions={
            "kind": "vector-ivf",
            "numLists": 800,
            "similarity": "COS",
            "dimensions": EMBEDDING_DIMENSIONS,
        },
    )

    for index_name in collection.index_information():
        print(index_name)

    insertdata()

    query = "Disrupted"
    query_vector = generate_embeddings(query)

    # The index above is a cosmosSearch index on VECTOR_FIELD, so the query
    # uses the cosmosSearch operator and the same field path.
    results = collection.aggregate(
        [
            {
                "$search": {
                    "cosmosSearch": {
                        "vector": query_vector,
                        "path": VECTOR_FIELD,
                        "k": 10,
                    },
                    "returnStoredSource": True,
                }
            }
        ]
    )

    for result in results:
        print(result)


if __name__ == "__main__":
    asyncio.run(main())