Skip to content

Instantly share code, notes, and snippets.

@zoharbabin
Last active May 23, 2023 17:14
Show Gist options
  • Save zoharbabin/df9dc4d6ce345b80cef6ff134a2c05b8 to your computer and use it in GitHub Desktop.
Save zoharbabin/df9dc4d6ce345b80cef6ff134a2c05b8 to your computer and use it in GitHub Desktop.
Sample code to use the LlamaIndex KalturaESearchReader - https://github.com/emptycrown/llama-hub/pull/286
import logging
import sys
from llama_index import (
download_loader,
GPTVectorStoreIndex,
LLMPredictor,
ServiceContext
)
from langchain.llms import OpenAI
from KalturaClient.Plugins.Core import KalturaMediaType
from KalturaClient.Plugins.ElasticSearch import (
KalturaESearchSortOrder, KalturaESearchEntryOrderByFieldName,
KalturaESearchOrderBy, KalturaESearchEntryOrderByItem, KalturaESearchCaptionItem,
KalturaESearchEntryItem, KalturaESearchEntryFieldName, KalturaESearchCaptionFieldName,
KalturaESearchEntryParams, KalturaESearchCategoryEntryItem, KalturaESearchEntryOperator,
KalturaESearchOperatorType, KalturaESearchItemType, KalturaCategoryEntryStatus, KalturaESearchCategoryEntryFieldName
)
# --- Kaltura account / session settings -------------------------------------
# PARTNER_ID and API_SECRET hold placeholder values here; replace them with
# the values of your own Kaltura account before running.
PARTNER_ID: int = 0
API_SECRET: str = "xxxxxxxxxxxxxxxxxxxxxxxxxxxx"
USER_ID: str = "LlamaTester"

# Kaltura Session (KS) parameters handed to the reader.
# NOTE(review): KS_TYPE 2 presumably selects an admin session and KS_EXPIRY is
# presumably in seconds (24 hours) - confirm against the Kaltura API docs.
KS_TYPE: int = 2
KS_EXPIRY: int = 86_400
KS_PRIVILEGES: str = "disableentitlement"

# --- API client settings -----------------------------------------------------
KALTURA_API_ENDPOINT: str = "https://cdnapi-ev.kaltura.com/"
REQUEST_TIMEOUT: int = 500  # NOTE(review): unit not visible here - confirm
SHOULD_LOG_API_CALLS: bool = True

# --- Search settings ---------------------------------------------------------
MAX_ENTRIES: int = 1  # how many entries to load (pageSize)
CATEGORY_NAME_TO_FILTER: str = "categoryname"  # <-- replace with your category name
# Route log records of level WARNING and above to stdout.
# basicConfig() attaches a stdout StreamHandler to the root logger when the
# root logger has no handlers yet, so no second handler is added afterwards:
# an extra StreamHandler on the same stream would print every record twice.
logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
# Resolve the loader class by name through llama_index's download_loader,
# pointing it at the zoharbabin/llama-hub fork (loader_hub_url) and at a
# loader_hub directory next to this project (custom_path).
KalturaESearchReader = download_loader(
    loader_class="KalturaESearchReader",
    custom_path="../llama-hub/loader_hub",
    loader_hub_url="https://raw.githubusercontent.com/zoharbabin/llama-hub/main/loader_hub/",
)

# Instantiate the reader with the account, session and client settings
# defined at the top of the script.
reader = KalturaESearchReader(
    partner_id=PARTNER_ID,
    api_secret=API_SECRET,
    user_id=USER_ID,
    ks_type=KS_TYPE,
    ks_expiry=KS_EXPIRY,
    ks_privileges=KS_PRIVILEGES,
    kaltura_api_endpoint=KALTURA_API_ENDPOINT,
    request_timeout=REQUEST_TIMEOUT,
    should_log_api_calls=SHOULD_LOG_API_CALLS,
) # type: ignore KalturaESearchReader
# --- Ordering: sort results by entry last-updated time, descending ---
newest_first = KalturaESearchEntryOrderByItem()
newest_first.sortField = KalturaESearchEntryOrderByFieldName.UPDATED_AT
newest_first.sortOrder = KalturaESearchSortOrder.ORDER_BY_DESC

result_order = KalturaESearchOrderBy()
result_order.orderItems = [newest_first]

# --- Filter 1: only entries that have caption content ---
caption_item = KalturaESearchCaptionItem()
caption_item.fieldName = KalturaESearchCaptionFieldName.CONTENT
caption_item.itemType = KalturaESearchItemType.EXISTS

# --- Filter 2: only entries actively assigned to the exact category name ---
category_item = KalturaESearchCategoryEntryItem()
category_item.categoryEntryStatus = KalturaCategoryEntryStatus.ACTIVE
category_item.fieldName = KalturaESearchCategoryEntryFieldName.NAME
category_item.addHighlight = False
category_item.itemType = KalturaESearchItemType.EXACT_MATCH
category_item.searchTerm = CATEGORY_NAME_TO_FILTER

# --- Filter 3: only video entries (KalturaMediaType.VIDEO) ---
entry_item = KalturaESearchEntryItem()
entry_item.fieldName = KalturaESearchEntryFieldName.MEDIA_TYPE
entry_item.addHighlight = False
entry_item.itemType = KalturaESearchItemType.EXACT_MATCH
entry_item.searchTerm = KalturaMediaType.VIDEO

# --- Combine the three filters with an AND relationship ---
all_filters = KalturaESearchEntryOperator()
all_filters.operator = KalturaESearchOperatorType.AND_OP
all_filters.searchItems = [caption_item, category_item, entry_item]

# --- Final search parameters handed to the reader ---
search_params = KalturaESearchEntryParams()
search_params.orderBy = result_order
search_params.searchOperator = all_filters
# Load the documents for the entries matching search_params.
# NOTE(review): the last two arguments are positional; presumably they mean
# "with captions" and "max entries" - confirm against the loader's load_data
# signature. The MAX_ENTRIES constant is not used by this call.
entry_docs = reader.load_data(search_params, True, 5)
# To hand the documents to LangChain instead, convert them like so:
# langchain_documents = [d.to_langchain_format() for d in entry_docs]

# LLM predictor backed by OpenAI's "text-davinci-003" model (temperature 0,
# streaming enabled), wrapped in a service context for the index.
completion_llm = OpenAI(temperature=0, model_name="text-davinci-003", streaming=True)
llm_predictor = LLMPredictor(llm=completion_llm)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

# Build a vector store index over the loaded documents and expose it as a
# streaming query engine that retrieves the 10 most similar nodes per query.
index = GPTVectorStoreIndex.from_documents(entry_docs, service_context=service_context)
query_engine = index.as_query_engine(similarity_top_k=10, streaming=True)
# What the LLM should extract from the indexed documents.
request = "the top 5 video segments where the speaker discusses the future of events in education"

# The prompt first shows the JSON shape, then explains each field.
# Every piece ends with a space so that sentences do not run into each other
# once concatenated, and the field explanations use exactly the key names
# shown in the JSON template (notably "keywords").
prompt = (
    "Provide a json formatted response of the following: "
    + request
    + ". Your json response should look like so: "
    + "{startTime: xxx, endTime:zzz, speakerName: zzz, keywords: nnn} . "
    + "startTime represents the time in the video this segment begins. "
    + "endTime represents the time in the video this segment ends and another topic begins (segment can be multiple lines long). "
    + "speakerName represents the primary speaker talking in that segment. "
    + "keywords represents a one-word description of the segment as a title of that segment. "
)

# The query engine was created with streaming=True, so the answer is printed
# through the response's streaming printer.
response_stream = query_engine.query(prompt)
response_stream.print_response_stream()
import logging
import sys
from llama_index import GPTVectorStoreIndex, download_loader
# --- Kaltura account / session settings -------------------------------------
# PARTNER_ID and API_SECRET hold placeholder values here; replace them with
# the values of your own Kaltura account before running.
PARTNER_ID: int = 0
API_SECRET: str = "xxxxxxxxxxxxxxxxxxxxxxxxxxxx"
USER_ID: str = "LlamaTester"

# Kaltura Session (KS) parameters handed to the reader.
# NOTE(review): KS_TYPE 2 presumably selects an admin session and KS_EXPIRY is
# presumably in seconds (24 hours) - confirm against the Kaltura API docs.
KS_TYPE: int = 2
KS_EXPIRY: int = 86_400
KS_PRIVILEGES: str = "disableentitlement"

# --- API client settings -----------------------------------------------------
KALTURA_API_ENDPOINT: str = "https://cdnapi-ev.kaltura.com/"
REQUEST_TIMEOUT: int = 500  # NOTE(review): unit not visible here - confirm
SHOULD_LOG_API_CALLS: bool = True

# --- Search settings ---------------------------------------------------------
MAX_ENTRIES: int = 1  # how many entries to load (pageSize)
CATEGORY_IDS_TO_FILTER: str = "123,56,6846"  # <-- replace with your category ids
# Route log records of level WARNING and above to stdout.
# basicConfig() attaches a stdout StreamHandler to the root logger when the
# root logger has no handlers yet, so no second handler is added afterwards:
# an extra StreamHandler on the same stream would print every record twice.
logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
# Resolve the loader class by name through llama_index's download_loader,
# pointing it at the zoharbabin/llama-hub fork (loader_hub_url) and at a
# loader_hub directory next to this project (custom_path).
KalturaESearchReader = download_loader(
    loader_class="KalturaESearchReader",
    custom_path="../llama-hub/loader_hub",
    loader_hub_url="https://raw.githubusercontent.com/zoharbabin/llama-hub/main/loader_hub/",
)

# Instantiate the reader with the account, session and client settings
# defined at the top of the script.
reader = KalturaESearchReader(
    partner_id=PARTNER_ID,
    api_secret=API_SECRET,
    user_id=USER_ID,
    ks_type=KS_TYPE,
    ks_expiry=KS_EXPIRY,
    ks_privileges=KS_PRIVILEGES,
    kaltura_api_endpoint=KALTURA_API_ENDPOINT,
    request_timeout=REQUEST_TIMEOUT,
    should_log_api_calls=SHOULD_LOG_API_CALLS,
) # type: ignore KalturaESearchReader
# Load documents through the reader's keyword-based search options: a
# free-text search for "education", captions requested, at most 5 entries.
# No category filter is applied (category_ids=None); the
# CATEGORY_IDS_TO_FILTER and MAX_ENTRIES constants are not used by this call.
entry_docs = reader.load_data(
    search_operator_and=True,
    free_text="education",
    category_ids=None,
    with_captions=True,
    max_entries=5,
)
# Debugging aid: pretty-print entry_docs here to inspect what was loaded.

# Index the loaded documents with the default service context and expose the
# index as a (non-streaming) query engine with default settings.
index = GPTVectorStoreIndex.from_documents(entry_docs)
query_engine = index.as_query_engine()
# What the LLM should extract from the indexed documents.
request = "the top 5 video segments where the speaker discusses the future of events in education"

# The prompt first shows the JSON shape, then explains each field.
# Every piece ends with a space so that sentences do not run into each other
# once concatenated, and the field explanations use exactly the key names
# shown in the JSON template (notably "keywords").
prompt = (
    "Provide a json formatted response of the following: "
    + request
    + ". Your json response should look like so: "
    + "{startTime: xxx, endTime:zzz, speakerName: zzz, keywords: nnn} . "
    + "startTime represents the time in the video this segment begins. "
    + "endTime represents the time in the video this segment ends and another topic begins (segment can be multiple lines long). "
    + "speakerName represents the primary speaker talking in that segment. "
    + "keywords represents a one-word description of the segment as a title of that segment. "
)

# Run the query and print the complete response.
response = query_engine.query(prompt)
print(response)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment