diff --git a/app/audio_embeddings_helper.py b/app/audio_embeddings_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..ddc15b81b91ed3274250b9ab7d906ced02c6db80 --- /dev/null +++ b/app/audio_embeddings_helper.py @@ -0,0 +1,106 @@ +from elasticsearch import Elasticsearch + +from app.elastic_search import AUDIO_EMBEDDINGS_INDEX + + +def get_audio_embeddings( + es: Elasticsearch, tape_year: int, + tape_name: str, timestamp_start_ms: int, + timestamp_end_ms: int +): + """Request the audio_embeddings for the given tape between the given timestamps.""" + + audio_embeddings = [] + lower_border = higher_border = 0 + + # Get lower border + rest = timestamp_start_ms % 1000 + if rest > 500: + lower_border = int(timestamp_start_ms / 1000) * 1000 + 500 + elif rest < 500: + lower_border = int(timestamp_start_ms / 1000) * 1000 + elif rest == 500: + lower_border = timestamp_start_ms + + # Get higher border + rest = timestamp_end_ms % 1000 + if rest > 500: + higher_border = int(timestamp_end_ms / 1000) * 1000 + 1000 + elif rest < 500: + higher_border = int(timestamp_end_ms / 1000) * 1000 + 500 + elif rest == 500: + higher_border = timestamp_end_ms + + for timestamp in range(lower_border, higher_border, 500): + query = { + "query": { + "query_string": { + "query": f"tape_year:{tape_year} AND tape_name:{tape_name} AND timestamp_start_ms:{timestamp}" + } + }, + "size": 1 + } + + query_result = es.search(index=AUDIO_EMBEDDINGS_INDEX, body=query) + audio_embedding = {x["_id"]: x["_source"] for x in query_result["hits"]["hits"]} + if len(audio_embedding) == 0: + continue + # todo handle what happens if the response does not contain the correct audio embedding + audio_embeddings.append(audio_embedding) + + return audio_embeddings + + +def request_script_score_from_elasticsearch(es: Elasticsearch, query_vector: list): + """Request a script score for the given query_vector (audio_embedding) from elasticsearch.""" + + # The cosineSimilarity function calculates the 
measure of cosine similarity between a given query vector and + # document vectors. + cosine_similarity_source = "cosineSimilarity(params.query_vector, 'embedding_vector') + 1.0" + + # The dotProduct function calculates the measure of dot product between a given query vector and document vectors. + dot_product_source = """ + double value = dotProduct(params.query_vector, 'embedding_vector'); + return sigmoid(1, Math.E, -value); + """ + + # The l1_norm function calculates L1 distance (Manhattan distance) between a given query vector and document + # vectors. + l1_norm_source = "1 / (1 + l1norm(params.query_vector, 'embedding_vector'))" + + # The l2_norm function calculates L2 distance (Euclidean distance) between a given query vector and document + # vectors. + l2_norm_source = "1 / (1 + l2norm(params.query_vector, 'embedding_vector'))" + + document = { + "_source": False, + "size": 20, + "fields": [ + "tape_year", + "tape_name", + "timestamp_start_ms", + "timestamp_end_ms", + ], + "query": { + "script_score": { + "query": { + "bool": { + "filter": { + "term": { + "_type": "_doc" + } + } + } + }, + "script": { + "source": cosine_similarity_source, + "params": { + "query_vector": query_vector + } + } + } + } + } + + response = es.search(index=AUDIO_EMBEDDINGS_INDEX, body=document) + return response diff --git a/app/audio_segments_helper.py b/app/audio_segments_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..590cd889027bde7cb4aa5428630a880b4be97272 --- /dev/null +++ b/app/audio_segments_helper.py @@ -0,0 +1,86 @@ +from elasticsearch import Elasticsearch + +from app.elastic_search import AUDIO_SEGMENTS_INDEX + + +def get_audio_segments(es: Elasticsearch): + """Request 100 available audio segments from elasticsearch.""" + + query = { + "_source": False, + "query": { + "match": { + "_type": { + "query": "_doc" + } + } + }, + "size": 100, + "sort": [ + { + "already_assessed": { + "order": "asc" + }, + "orca_probability": { + "order": "desc" 
+ } + } + ] + } + query_result = es.search(index=AUDIO_SEGMENTS_INDEX, body=query) + audio_segment_ids = [x["_id"] for x in query_result["hits"]["hits"]] + return audio_segment_ids + + +def get_audio_segments_filtered(es: Elasticsearch, year: int, tape: str, time_query: str): + """Request 10000 available audio segments from elasticsearch which match the filter query.""" + + query = { + "_source": False, + "query": { + "query_string": { + "query": f"tape_year:{year} AND tape_name:{tape}" + } + }, + "size": 10000, + "sort": [ + { + "already_assessed": { + "order": "asc" + }, + "orca_probability": { + "order": "desc" + } + } + ] + } + query_result = es.search(index=AUDIO_SEGMENTS_INDEX, body=query) + audio_segment_ids = [x["_id"] for x in query_result["hits"]["hits"]] + return audio_segment_ids + + +def get_audio_segment(es: Elasticsearch, audio_segment_id: str): + """Request information about a specific audio_segment from elasticsearch.""" + query = { + "_source": False, + "query": { + "match": { + "_id": { + "query": audio_segment_id + } + } + }, + "fields": [ + "tape_year", + "tape_name", + "tape_channel", + "timestamp_start_ms", + "timestamp_end_ms", + "already_assessed", + "orca_probability" + ] + } + + query_result = es.search(index=AUDIO_SEGMENTS_INDEX, body=query) + audio_segment = {x["_id"]: x["fields"] for x in query_result["hits"]["hits"]} + return audio_segment diff --git a/app/audio_tape_helper.py b/app/audio_tape_helper.py index e49bf4c7435574e4d6d254aca37698084e8c53d6..10c62b877c0b54382d049924a0ba929b92c362d9 100644 --- a/app/audio_tape_helper.py +++ b/app/audio_tape_helper.py @@ -137,7 +137,6 @@ def get_audio_tape_position_in_year(year: int, audio_tape_name: str, es: Elastic def get_audio_tape_start_date(tape_name: str, year: int, es: Elasticsearch): - query = { "size": 10000, "sort": [ @@ -289,4 +288,55 @@ def get_labeled_segments_per_year(es: Elasticsearch, year_of_audio_tape_ref): year = year_of_audio_tape_ref[tape_ref] insert_labels(year, 
labels_for_tape["by_labels"]["buckets"]) - return labels_per_year \ No newline at end of file + return labels_per_year + + +def get_audio_tape_information(es: Elasticsearch, tape_year: int, tape_name: str): + """Request information about a specific audio tape.""" + query = { + "_source": False, + "query": { + "query_string": { + "query": f"year:{tape_year} AND name:{tape_name}" + } + }, + "fields": [ + "_id", + "name", + "year", + "left", + "right" + ], + "size": 10 + } + + query_result = es.search(index=AUDIO_TAPES_INDEX, body=query) + # todo error handling + + audio_tape = {x["_id"]: x["fields"] for x in query_result["hits"]["hits"]} + return audio_tape + + +def get_audio_tapes_with_audio_embeddings_available(es: Elasticsearch): + query = { + "_source": False, + "query": { + "match": { + "audio_embeddings_available": { + "query": True + } + } + }, + "size": 10000 + } + + query_result = es.search(index=AUDIO_TAPES_INDEX, body=query) + # todo error handling + + audio_tapes = [x["_id"] for x in query_result["hits"]["hits"]] + names = [] + for name in audio_tapes: + year = name.split("_")[2] + tape = name.split("_")[3] + names.append(f"{year}_{tape}") + return names diff --git a/app/elastic_search.py b/app/elastic_search.py index e46abdc46bb005d9076bd217f85af15c91291e49..d290433b0be49f008b895112dfeab007450e459b 100644 --- a/app/elastic_search.py +++ b/app/elastic_search.py @@ -8,6 +8,9 @@ COMMENT_SEGMENTS_INDEX: str = "comment_segments" LABELED_SEGMENTS_INDEX: str = "labeled_segments" LAB_BOOKS_INDEX: str = "lab_books" LAB_BOOK_ENTRIES_INDEX: str = "lab_book_entries" +AUDIO_SEGMENTS_INDEX: str = "audio_segments" +ORCA_SEQUENCES_INDEX: str = "orca_sequences" +AUDIO_EMBEDDINGS_INDEX: str = "audio_embeddings" class ElasticSearchTimeoutError(Exception): pass diff --git a/app/lab_book_helper.py b/app/lab_book_helper.py index 74c04e4c01439a12e6e6c967a63799cde2dc13f7..fb56046ac4ace602cec855696b5766a8e4f75b7b 100644 --- a/app/lab_book_helper.py +++ b/app/lab_book_helper.py 
@@ -78,6 +78,8 @@ def get_num_lab_books_per_year(es: Elasticsearch): } } query_result = es.search(index=LAB_BOOKS_INDEX, body=query) + if (query_result["hits"]["total"]["value"] == 0): + return lab_books_per_year min_year = date.fromisoformat(query_result["aggregations"]["min_year"]["value_as_string"]).year max_year = date.fromisoformat(query_result["aggregations"]["max_year"]["value_as_string"]).year diff --git a/app/main.py b/app/main.py index 6c3190e23524ef56b7f740b4de7c82f56b376fdd..2ed48bb317be5f1c5606579db930bcc5b9f05bab 100644 --- a/app/main.py +++ b/app/main.py @@ -1,10 +1,11 @@ #!/usr/bin/env python - +import uvicorn from fastapi import FastAPI, Request, Response from app.journal import JournalInterceptorRoute from app.routers import labbooks, modify_comment_segment, modify_labbook, modify_labbook_entry, \ - modify_labeled_segment, modify_tape, search, tapes, top, OrchiveAPIRouter + modify_labeled_segment, modify_tape, search, tapes, top, OrchiveAPIRouter, audio_segments, orca_sequences, \ + modify_orca_sequence, modify_audio_segment from app.elastic_search import * @@ -17,11 +18,15 @@ app.include_router(top.router, tags=["Top"]) app.include_router(tapes.router, tags=["Tapes"]) app.include_router(labbooks.router, tags=["Labbooks"]) app.include_router(search.router, tags=["Search"]) +app.include_router(audio_segments.router, tags=["Audio Segments"]) +app.include_router(orca_sequences.router, tags=["Orca Sequences"]) app.include_router(modify_labbook.router, tags=["Modify Labbook"]) app.include_router(modify_labbook_entry.router, tags=["Modify Labbook Entry"]) app.include_router(modify_tape.router, tags=["Modify Tape"]) app.include_router(modify_labeled_segment.router, tags=["Modify Labeled Segment"]) app.include_router(modify_comment_segment.router, tags=["Modify Comment Segment"]) +app.include_router(modify_audio_segment.router, tags=["Modify Audio Segment"]) +app.include_router(modify_orca_sequence.router, tags=["Modify Orca 
Sequence"]) app_router = OrchiveAPIRouter(route_class=JournalInterceptorRoute) diff --git a/app/orca_sequences_helper.py b/app/orca_sequences_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..e858aa0434ffcc17ce88760ab40b44099fb863bb --- /dev/null +++ b/app/orca_sequences_helper.py @@ -0,0 +1,147 @@ +from elasticsearch import Elasticsearch +from app.elastic_search import ORCA_SEQUENCES_INDEX + + +def get_orca_sequences(es: Elasticsearch): + """Request all available orca_sequences from elasticsearch.""" + + # todo Be aware that if there will be more than 1000 sequences, this request might cause trouble + query = { + "_source": False, + "query": { + "match": { + "_type": { + "query": "_doc" + } + } + }, + "fields": [ + "tape_year", + "tape_name", + "timestamp_start_ms", + "timestamp_end_ms" + ], + "size": 1000, + "sort": [ + { + "tape_year": { + "order": "asc" + }, + "tape_name": { + "order": "asc" + } + } + ] + } + + query_result = es.search(index=ORCA_SEQUENCES_INDEX, body=query) + orca_sequences = {x["_id"]: x["fields"] for x in query_result["hits"]["hits"]} + return orca_sequences + + +def get_orca_sequence(es: Elasticsearch, orca_sequence_id: str): + """Request information about a specific orca_sequences from elasticsearch.""" + + query = { + "_source": False, + "query": { + "match": { + "_id": { + "query": orca_sequence_id + } + } + }, + "fields": [ + "tape_year", + "tape_name", + "tape_channel", + "audio_segment", + "timestamp_start_ms", + "timestamp_end_ms", + "rating", + "tags", + "user_name" + ] + } + + query_result = es.search(index=ORCA_SEQUENCES_INDEX, body=query) + orca_sequence = {x["_id"]: x["fields"] for x in query_result["hits"]["hits"]} + return orca_sequence + + +def get_orca_sequences_tags(es: Elasticsearch): + """Request all tags which were used to describe orca_sequences from elasticsearch.""" + + query = { + "aggs": { + "tags": { + "terms": { + "field": "tags", + "size": 10000 + } + } + }, + "size": 0 + } + + 
query_result = es.search(index=ORCA_SEQUENCES_INDEX, body=query) + + # Get the bucket from the request + buckets = query_result["aggregations"]["tags"]["buckets"] + + # Loop though the bucket and create a clean dict which contains the tags + tags = {} + for entry in buckets: + tags[entry["key"]] = entry["doc_count"] + + return tags + + +def get_orca_sequences_user_names(es: Elasticsearch): + """Request all usernames which were used to describe orca_sequences from elasticsearch.""" + + query = { + "aggs": { + "tags": { + "terms": { + "field": "user_name", + "size": 10000 + } + } + }, + "size": 0 + } + + query_result = es.search(index=ORCA_SEQUENCES_INDEX, body=query) + + # Get the bucket from the request + buckets = query_result["aggregations"]["tags"]["buckets"] + + # Loop though the bucket and create a clean dict which contains the tags + user_names = {} + for entry in buckets: + user_names[entry["key"]] = entry["doc_count"] + + return user_names + + +def get_orca_sequences_audio_segment(es: Elasticsearch, audio_segment_id: str): + """Request all available orca_sequences for the given audio_segment from elasticsearch.""" + + query = { + "_source": False, + "query": { + "match": { + "audio_segment": { + "query": audio_segment_id + } + } + }, + "size": 1000 + } + + query_result = es.search(index=ORCA_SEQUENCES_INDEX, body=query) + orca_sequences = [x["_id"] for x in query_result["hits"]["hits"]] + return orca_sequences + + diff --git a/app/routers/audio_segments.py b/app/routers/audio_segments.py new file mode 100644 index 0000000000000000000000000000000000000000..0045819806a21c816161cc5c2bdaa621f46c3e43 --- /dev/null +++ b/app/routers/audio_segments.py @@ -0,0 +1,99 @@ +from typing import List +from fastapi.responses import JSONResponse +from pydantic import BaseModel +from app.routers import Message, OrchiveAPIRouter +from app import audio_segments_helper +from ..elastic_search import get_elasticsearch_client + +router = OrchiveAPIRouter() + + +class 
BrowseAudioSegmentsEntry(BaseModel): + audio_segment_id: str + + +class BrowseAudioSegmentEntry(BaseModel): + tape_year: int + tape_name: str + tape_channel: str + timestamp_start_ms: int + timestamp_end_ms: int + already_assessed: bool + orca_probability: float + + +@router.get("/browse/mark/audio_segments", + response_model=list, + responses={ + 400: {"model": Message}, + 404: {"model": Message} + }) +async def browse_audio_segments(): + """Request a list of 100 available audio segments.""" + es = get_elasticsearch_client() + + # Get the available audio_segments + audio_segment_ids = audio_segments_helper.get_audio_segments(es) + + # Check if there are any audio_segments available + if len(audio_segment_ids) == 0: + return JSONResponse(status_code=400, content={"message": f"No audio_segments found."}) + + return audio_segment_ids + + +@router.get("/browse/mark/audio_segments/year/{year}/tape/{tape}/timequery/{timequery}", + response_model=list, + responses={ + 400: {"model": Message}, + 404: {"model": Message} + }) +async def browse_audio_segments(year: int, tape: str, timequery: str): + """Request a list of 10000 available audio segments which match the given parameters.""" + es = get_elasticsearch_client() + + # Get the available audio_segments + audio_segment_ids = audio_segments_helper.get_audio_segments_filtered(es, year, tape, timequery) + + # Check if there are any audio_segments available + if len(audio_segment_ids) == 0: + return JSONResponse(status_code=400, content={"message": f"No audio_segments found."}) + + return audio_segment_ids + + +@router.get("/browse/mark/audio_segment/{audio_segment_id}", + response_model=BrowseAudioSegmentEntry, + responses={ + 400: {"model": Message}, + 404: {"model": Message} + }) +async def browse_audio_segment(audio_segment_id: str): + """Request information about the given audio segment id.""" + es = get_elasticsearch_client() + + # Get the available information about the audio segment + audio_segment = 
audio_segments_helper.get_audio_segment(es, audio_segment_id) + + # Check if there was an audio segment with the given id + if len(audio_segment) == 0: + return JSONResponse( + status_code=400, + content={"message": f"No audio_segment with the id: {audio_segment_id} found."} + ) + + # Access the values of the response + audio_segment = audio_segment[audio_segment_id] + + # Fill the response from + audio_seg = { + "tape_year": audio_segment.get("tape_year", [0])[0], + "tape_name": audio_segment.get("tape_name", ["None"])[0], + "tape_channel": audio_segment.get("tape_channel", ["None"])[0], + "timestamp_start_ms": audio_segment.get("timestamp_start_ms", [0])[0], + "timestamp_end_ms": audio_segment.get("timestamp_end_ms", [0])[0], + "already_assessed": audio_segment.get("already_assessed", [False])[0], + "orca_probability": audio_segment.get("orca_probability", [0])[0], + } + + return audio_seg diff --git a/app/routers/modify_audio_segment.py b/app/routers/modify_audio_segment.py new file mode 100644 index 0000000000000000000000000000000000000000..44906adeae19630a252b8e28420239c1325173dc --- /dev/null +++ b/app/routers/modify_audio_segment.py @@ -0,0 +1,69 @@ +from pydantic import BaseModel +from starlette.responses import JSONResponse + +from app.routers import OrchiveAPIRouter, Message +from ..elastic_search import get_elasticsearch_client, AUDIO_SEGMENTS_INDEX + +router = OrchiveAPIRouter() + + +class ModifyAudioSegmentEntry(BaseModel): + audio_segment_id: str + already_assessed: bool + + +@router.put("/audio_segment/{audio_segment_id}", + responses={ + 400: {"model": Message}, + 404: {"model": Message} + }) +async def update_audio_segment_assessed(audio_segment_id: str, + browseOrcaSequenceEntryAddUpdate: ModifyAudioSegmentEntry): + """Updates the already assessed information about the given audio segment""" + es = get_elasticsearch_client() + + # Check id the provided data matches provided orca_sequence_id + if audio_segment_id != 
browseOrcaSequenceEntryAddUpdate.audio_segment_id: + return JSONResponse(status_code=400, content={"message": f"Message body does not match the provided id."}) + + document = {"doc": { + "already_assessed": browseOrcaSequenceEntryAddUpdate.already_assessed, + }} + + # Check if an orca sequence with the provided orca_sequence_id exists + audio_segment_exists = es.exists(index=AUDIO_SEGMENTS_INDEX, id=audio_segment_id) + + # If the orca sequence does not exist, give feedback + if audio_segment_exists: + es.update(index=AUDIO_SEGMENTS_INDEX, id=audio_segment_id, body=document) + return f"Successfully updated orca sequence: id: {audio_segment_id}, {document}" + else: + return JSONResponse( + status_code=400, + content={"message": f"Orca sequence with id {audio_segment_id} does not exist."} + ) + + +@router.delete("/audio_segment/{audio_segment_id}", + responses={ + 400: {"model": Message}, + 404: {"model": Message} + }) +async def delete_audio_segment(audio_segment_id: str): + """Delete the given audio segment from elasticsearch.""" + + es = get_elasticsearch_client() + + # Check if the given audio segment exists as entry in elasticsearch + audio_segment_exists = es.exists(index=AUDIO_SEGMENTS_INDEX, id=audio_segment_id) + + # If the audio segment exists, delete it + if audio_segment_exists: + es.delete(index=AUDIO_SEGMENTS_INDEX, id=audio_segment_id) + return f"Successfully deleted audio segment id: {audio_segment_id}" + else: + # It the audio segment does not exist, give feedback + return JSONResponse( + status_code=400, + content={"message": f"Audio segment with id: {audio_segment_id} does not exists."} + ) diff --git a/app/routers/modify_orca_sequence.py b/app/routers/modify_orca_sequence.py new file mode 100644 index 0000000000000000000000000000000000000000..27c307d698810dd68e03fd2d331822873838851b --- /dev/null +++ b/app/routers/modify_orca_sequence.py @@ -0,0 +1,154 @@ +from fastapi.responses import JSONResponse +from pydantic import BaseModel +from app.routers 
import OrchiveAPIRouter, Message +from ..elastic_search import get_elasticsearch_client, ORCA_SEQUENCES_INDEX + +router = OrchiveAPIRouter() + + +class ModifyOrcaSequenceEntry(BaseModel): + orca_sequence_id: str + tape_year: int + tape_name: str + tape_channel: str + audio_segment: str + timestamp_start_ms: int + timestamp_end_ms: int + rating: int + tags: list + user_name: str + + +class ModifyOrcaSequenceEntryRating(BaseModel): + orca_sequence_id: str + rating: int + + +@router.post("/orca_sequence", + responses={ + 400: {"model": Message}, + 404: {"model": Message} + }) +async def add_orca_sequence(browseOrcaSequenceEntryAddUpdate: ModifyOrcaSequenceEntry): + """Adds a orca sequence to elasticsearch.""" + es = get_elasticsearch_client() + + orca_sequence_id = browseOrcaSequenceEntryAddUpdate.orca_sequence_id + + # todo Add handling which check if there are values passed in the model or if the models fields are emtpy + document = { + "tape_year": browseOrcaSequenceEntryAddUpdate.tape_year, + "tape_name": browseOrcaSequenceEntryAddUpdate.tape_name, + "tape_channel": browseOrcaSequenceEntryAddUpdate.tape_channel, + "audio_segment": browseOrcaSequenceEntryAddUpdate.audio_segment, + "timestamp_start_ms": browseOrcaSequenceEntryAddUpdate.timestamp_start_ms, + "timestamp_end_ms": browseOrcaSequenceEntryAddUpdate.timestamp_end_ms, + "rating": browseOrcaSequenceEntryAddUpdate.rating, + "tags": browseOrcaSequenceEntryAddUpdate.tags, + "user_name": browseOrcaSequenceEntryAddUpdate.user_name + } + + # Check if an orca sequence with the provided orca_sequence_id already exists + orca_sequence_exists = es.exists(index=ORCA_SEQUENCES_INDEX, id=orca_sequence_id) + + # If the orca sequence already exists, give feedback + if orca_sequence_exists: + return JSONResponse( + status_code=400, + content={"message": f"Orca sequence with id {orca_sequence_id} already exists."} + ) + else: + es.index(index=ORCA_SEQUENCES_INDEX, id=orca_sequence_id, body=document) + return f"Successfully 
added orca sequence: id: {orca_sequence_id}, {document}" + + +@router.put("/orca_sequence/{orca_sequence_id}", + responses={ + 400: {"model": Message}, + 404: {"model": Message} + }) +async def update_orca_sequence(orca_sequence_id: str, browseOrcaSequenceEntryAddUpdate: ModifyOrcaSequenceEntry): + """Updates information about the given orca sequence""" + es = get_elasticsearch_client() + + # Check id the provided data matches provided orca_sequence_id + if orca_sequence_id != browseOrcaSequenceEntryAddUpdate.orca_sequence_id: + return JSONResponse(status_code=400, content={"message": f"Message body does not match the provided id."}) + + # todo Add handling which check if there are values passed in the model or if the models fields are emtpy + document = {"doc": { + "tape_year": browseOrcaSequenceEntryAddUpdate.tape_year, + "tape_name": browseOrcaSequenceEntryAddUpdate.tape_name, + "tape_channel": browseOrcaSequenceEntryAddUpdate.tape_channel, + "audio_segment": browseOrcaSequenceEntryAddUpdate.audio_segment, + "timestamp_start_ms": browseOrcaSequenceEntryAddUpdate.timestamp_start_ms, + "timestamp_end_ms": browseOrcaSequenceEntryAddUpdate.timestamp_end_ms, + "rating": browseOrcaSequenceEntryAddUpdate.rating, + "tags": browseOrcaSequenceEntryAddUpdate.tags, + "user_name": browseOrcaSequenceEntryAddUpdate.user_name + }} + + # Check if an orca sequence with the provided orca_sequence_id exists + orca_sequence_exists = es.exists(index=ORCA_SEQUENCES_INDEX, id=orca_sequence_id) + + # If the orca sequence does not exist, give feedback + if orca_sequence_exists: + es.update(index=ORCA_SEQUENCES_INDEX, id=orca_sequence_id, body=document) + return f"Successfully updated orca sequence: id: {orca_sequence_id}, {document}" + else: + return JSONResponse( + status_code=400, + content={"message": f"Orca sequence with id {orca_sequence_id} does not exist."} + ) + + +@router.put("/orca_sequence/rating/{orca_sequence_id}", + responses={ + 400: {"model": Message}, + 404: {"model": 
Message} + }) +async def update_orca_sequence_rating(orca_sequence_id: str, browseOrcaSequenceUpdateRating: ModifyOrcaSequenceEntryRating): + """Updates the rating information about the given orca sequence""" + es = get_elasticsearch_client() + + # Check id the provided data matches provided orca_sequence_id + if orca_sequence_id != browseOrcaSequenceUpdateRating.orca_sequence_id: + return JSONResponse(status_code=400, content={"message": f"Message body does not match the provided id."}) + + # todo Add handling which check if there are values passed in the model or if the models fields are emtpy + document = {"doc": { + "rating": browseOrcaSequenceUpdateRating.rating, + }} + + # Check if an orca sequence with the provided orca_sequence_id exists + orca_sequence_exists = es.exists(index=ORCA_SEQUENCES_INDEX, id=orca_sequence_id) + + # If the orca sequence does not exist, give feedback + if orca_sequence_exists: + es.update(index=ORCA_SEQUENCES_INDEX, id=orca_sequence_id, body=document) + return f"Successfully updated orca sequence: id: {orca_sequence_id}, {document}" + else: + return JSONResponse( + status_code=400, + content={"message": f"Orca sequence with id {orca_sequence_id} does not exist."} + ) + + +@router.delete("/orca_sequence/{orca_sequence_id}") +async def delete_orca_sequence(orca_sequence_id: str): + """Deletes the given orca sequence""" + es = get_elasticsearch_client() + + # Check if the given orca sequence exists as entry in elasticsearch + orca_sequence_exists = es.exists(index=ORCA_SEQUENCES_INDEX, id=orca_sequence_id) + + # If the audio segment exists, delete it + if orca_sequence_exists: + es.delete(index=ORCA_SEQUENCES_INDEX, id=orca_sequence_id) + return f"Successfully deleted orca sequence id: {orca_sequence_id}" + else: + # It the orca sequence does not exist, give feedback + return JSONResponse( + status_code=400, + content={"message": f"Orca sequence with id: {orca_sequence_id} does not exists."} + ) diff --git 
a/app/routers/orca_sequences.py b/app/routers/orca_sequences.py new file mode 100644 index 0000000000000000000000000000000000000000..531f8065123725c6f5bed3da0bcb6873f1e1fa6f --- /dev/null +++ b/app/routers/orca_sequences.py @@ -0,0 +1,193 @@ +from typing import List + +from fastapi.responses import JSONResponse +from pydantic import BaseModel +from app.routers import Message, OrchiveAPIRouter +from app import orca_sequences_helper, audio_tape_helper +from ..elastic_search import get_elasticsearch_client + +router = OrchiveAPIRouter() + + +class BrowseOrcaSequenceEntry(BaseModel): + orca_sequence_id: str + tape_year: int + tape_name: str + tape_channel: str + audio_segment: str + timestamp_start_ms: int + timestamp_end_ms: int + rating: int + tags: list + user_name: str + + +@router.get("/browse/mark/orca_sequences", + response_model=list, + responses={ + 400: {"model": Message}, + 404: {"model": Message} + }) +async def browse_orca_sequences(): + """Request a list of all available orca sequences.""" + es = get_elasticsearch_client() + + # Get the available orca_sequence ids + orca_sequences = orca_sequences_helper.get_orca_sequences(es) + + # Check if there were any orca_sequences available + if len(orca_sequences) == 0: + return JSONResponse(status_code=400, content={"message": f"No orca_sequences found."}) + + return [sequence for sequence in orca_sequences] + + +@router.get("/browse/mark/orca_sequences/embeddings_available/{embeddings_available}", + response_model=list, + responses={ + 400: {"model": Message}, + 404: {"model": Message} + }) +async def browse_orca_sequences(embeddings_available: bool): + """Request a list of all available orca sequences.""" + es = get_elasticsearch_client() + + if embeddings_available: + # Get the available orca_sequence ids for which audio_embeddings are imported + tapes_with_embeddings = audio_tape_helper.get_audio_tapes_with_audio_embeddings_available(es) + + # Get all available orca sequences + orca_sequences_raw = 
orca_sequences_helper.get_orca_sequences(es) + + orca_sequences = [] + # Filter the response for orca sequences with audio embeddings available + for orca_sequence in orca_sequences_raw: + year = orca_sequences_raw.get(orca_sequence).get("tape_year")[0] + tape = orca_sequences_raw.get(orca_sequence).get("tape_name")[0] + query = f"{year}_{tape}" + if query in tapes_with_embeddings: + orca_sequences.append(orca_sequence) + else: + # Get the available orca_sequence ids no matter if there are audio embeddings available for them + orca_sequences = orca_sequences_helper.get_orca_sequences(es) + + # Check if there were any orca_sequences available + if len(orca_sequences) == 0: + return JSONResponse(status_code=400, content={"message": f"No orca_sequences found."}) + + return [sequence for sequence in orca_sequences] + + +@router.get("/browse/mark/orca_sequence/{orca_sequence_id}", + response_model=BrowseOrcaSequenceEntry, + responses={ + 400: {"model": Message}, + 404: {"model": Message} + }) +async def browse_orca_sequence(orca_sequence_id: str): + """Request information about the given orca sequence id.""" + + es = get_elasticsearch_client() + + # Get the information about the orca_sequence + orca_sequence = orca_sequences_helper.get_orca_sequence(es, orca_sequence_id) + + # Check if there was an orca_sequence with this id available + if len(orca_sequence) == 0: + return JSONResponse( + status_code=400, + content={"message": f"No orca_sequence with id: {orca_sequence_id} found."} + ) + + # Access the values of the response + orca_sequence = orca_sequence[orca_sequence_id] + print(orca_sequence) + + # Fill in the response form + orca_seq = { + "orca_sequence_id": orca_sequence_id, + "tape_year": orca_sequence.get("tape_year", [0])[0], + "tape_name": orca_sequence.get("tape_name", ["None"])[0], + "tape_channel": orca_sequence.get("tape_channel", ["None"])[0], + "audio_segment": orca_sequence.get("audio_segment", ["None"])[0], + "timestamp_start_ms": 
orca_sequence.get("timestamp_start_ms", ["None"])[0], + "timestamp_end_ms": orca_sequence.get("timestamp_end_ms", ["None"])[0], + "rating": orca_sequence.get("rating", ["None"])[0], + "tags": orca_sequence.get("tags", []), + "user_name": orca_sequence.get("user_name", ["None"])[0], + } + + return orca_seq + + +@router.get("/browse/mark/orca_sequences/tags", + response_model=dict, + responses={ + 400: {"model": Message}, + 404: {"model": Message} + }) +async def browse_orca_sequences_tags(): + """Request a list of all tags which were used to describe orca sequences from elasticsearch.""" + es = get_elasticsearch_client() + + # Get the available orca_sequences ids for the given audio_segment + orca_sequences_tags = orca_sequences_helper.get_orca_sequences_tags(es) + + # Check if there were any orca_sequences available for the given audio_segment + if len(orca_sequences_tags) == 0: + return JSONResponse( + status_code=404, + content={"message": f"No tags found for orca_sequences."} + ) + + return orca_sequences_tags + + +@router.get("/browse/mark/orca_sequences/user_names", + response_model=dict, + responses={ + 400: {"model": Message}, + 404: {"model": Message} + }) +async def browse_orca_sequences_user_names(): + """Request a list of all usernames which were used to describe orca sequences from elasticsearch.""" + es = get_elasticsearch_client() + + # Get the available orca_sequences ids for the given audio_segment + orca_sequences_user_names = orca_sequences_helper.get_orca_sequences_user_names(es) + + # Check if there were any orca_sequences available for the given audio_segment + if len(orca_sequences_user_names) == 0: + return JSONResponse( + status_code=404, + content={"message": f"No user names found for orca_sequences."} + ) + + return orca_sequences_user_names + + +@router.get("/browse/mark/orca_sequences/audio_segment/{audio_segment_id}", + response_model=list, + responses={ + 400: {"model": Message}, + 404: {"model": Message} + }) +async def 
browse_orca_sequences_audio_segment(audio_segment_id: str): + """Request a list of all available orca sequences for the given audio segment id.""" + es = get_elasticsearch_client() + + # Get the available orca_sequences ids for the given audio_segment + orca_sequences = orca_sequences_helper.get_orca_sequences_audio_segment(es, audio_segment_id) + + # Check if there were any orca_sequences available for the given audio_segment + if len(orca_sequences) == 0: + # todo Right now, even if there exists no audio_segment with the provided id, we send the following response. + # Maybe handle those cases with different responses. + # One for "There is no audio_segment with the provided id" and one which indicates, that there are no + # orca_sequences for the given audio_segment. + return JSONResponse( + status_code=404, + content={"message": f"No orca_sequences for the audio_segment with id: {audio_segment_id} found."} + ) + + return orca_sequences diff --git a/app/routers/search.py b/app/routers/search.py index dd464a3f989d0f30cbe5c7e16ad75359fe682235..d408b58fe1c2d05a2712f7d3292fcfd655fd9ae8 100644 --- a/app/routers/search.py +++ b/app/routers/search.py @@ -1,16 +1,20 @@ +from collections import defaultdict from enum import Enum from datetime import date +from math import inf from typing import Any, List, Optional from pydantic import BaseModel +from starlette.responses import JSONResponse from app.elastic_search import AUDIO_TAPES_INDEX, COMMENT_SEGMENTS_INDEX, LABELED_SEGMENTS_INDEX, \ LAB_BOOKS_INDEX, LAB_BOOK_ENTRIES_INDEX -from . import OrchiveAPIRouter +from . import OrchiveAPIRouter, Message from .labbooks import BoundingBox +from .. 
import orca_sequences_helper, audio_embeddings_helper, audio_tape_helper +from ..audio_embeddings_helper import request_script_score_from_elasticsearch from ..elastic_search import get_elasticsearch_client - router = OrchiveAPIRouter() @@ -87,14 +91,14 @@ async def search( es = get_elasticsearch_client() search_fields_query = { - "bool": { - "should": [ - {"match": {"words": query}}, # comment_segment_index - {"match": {"label": query}}, # labeled_segment_index - {"match": {"value": query}}, # lab_book_entry_index - ] - } + "bool": { + "should": [ + {"match": {"words": query}}, # comment_segment_index + {"match": {"label": query}}, # labeled_segment_index + {"match": {"value": query}}, # lab_book_entry_index + ] } + } if from_date is not None or to_date is not None: date_filter_query = {"range": {"search_timeline_start": {}}} @@ -272,14 +276,14 @@ async def search_timeline( } es_query["aggs"] = { - "by_start_date": { - "date_histogram": { - "field": "search_timeline_start", - "calendar_interval": "year", - "format": "yyyy" - } - }, - } + "by_start_date": { + "date_histogram": { + "field": "search_timeline_start", + "calendar_interval": "year", + "format": "yyyy" + } + }, + } search_result_tape = es.search(index=",".join([COMMENT_SEGMENTS_INDEX, LABELED_SEGMENTS_INDEX]), body=es_query) search_result_labbooks = es.search(index=LAB_BOOK_ENTRIES_INDEX, body=es_query) @@ -291,7 +295,7 @@ async def search_timeline( year = int(tape_results["key_as_string"]) count = tape_results["doc_count"] total_count_tapes += count - results[year] = { "year": year, "total_tapes": count, "total_labbooks": 0 } + results[year] = {"year": year, "total_tapes": count, "total_labbooks": 0} years_already_in_result.append(year) total_count_labbooks = 0 @@ -304,14 +308,14 @@ async def search_timeline( if year in results: results[year]["total_labbooks"] = count else: - results[year] = { "year": year, "total_tapes": 0, "total_labbooks": count } + results[year] = {"year": year, "total_tapes": 0, 
class SearchAudioSegmentsForOrcaSequence(BaseModel):
    # NOTE(review): this model does not appear to be referenced by the routes
    # below (they declare response_model=list) — confirm whether it is used
    # elsewhere before removing.
    total: int
    total_labbooks: int
    total_tapes: int
    hits: List[SearchResultTimelineHits]


@router.get("/search-audio_segments/{orca_sequence_id}", response_model=list,
            responses={
                400: {"model": Message},
                404: {"model": Message}
            })
async def search_audio_segments(orca_sequence_id: str):
    """Search all audio_segments for matches for a given orca_sequences audio_vector.

    Pipeline: load the orca_sequence, fetch the audio_embeddings covering its
    time span, query elasticsearch for similar embedding vectors per embedding,
    filter out hits that overlap the query segment itself or an existing
    orca_sequence, group consecutive 500 ms hits into runs, and return up to
    three run names ("audio_embedding_<year>_<tape>_<start>_<end>").
    """
    es = get_elasticsearch_client()

    # Get the orca_sequence in order to find the origin_audio_segment
    orca_sequence = orca_sequences_helper.get_orca_sequence(es, orca_sequence_id=orca_sequence_id)

    # Check if there was an orca_sequence with this id available
    if len(orca_sequence) == 0:
        return JSONResponse(
            status_code=400,
            content={"message": f"No orca_sequence with id: {orca_sequence_id} found."}
        )

    # Get the origin tape coordinates in order to find the audio_embeddings
    # which need to be retrieved. Fields are stored as single-element lists.
    tape_year = orca_sequence[orca_sequence_id]["tape_year"][0]
    tape_name = orca_sequence[orca_sequence_id]["tape_name"][0]
    timestamp_start_ms = orca_sequence[orca_sequence_id]["timestamp_start_ms"][0]
    timestamp_end_ms = orca_sequence[orca_sequence_id]["timestamp_end_ms"][0]

    # Get the audio_embeddings covering the sequence's time span.
    audio_embeddings = audio_embeddings_helper.get_audio_embeddings(
        es=es,
        tape_year=tape_year,
        tape_name=tape_name,
        timestamp_start_ms=timestamp_start_ms,
        timestamp_end_ms=timestamp_end_ms
    )

    # Check if there were audio_embeddings available with the given specifications
    if len(audio_embeddings) == 0:
        return JSONResponse(
            status_code=400,
            content={"message": f"No audio_embeddings for the orca sequence {orca_sequence_id} found."}
        )

    responses = {}

    # Iterate through the audio_embeddings and run one similarity search per
    # embedding vector.
    for embeddings in audio_embeddings:
        audio_embedding_id = next(iter(embeddings))
        # Get the embedding_vector from the response
        embedding_vector = embeddings[audio_embedding_id]["embedding_vector"]

        # Request similar embedding_vectors based on the previously retrieved embedding_vector
        responses[audio_embedding_id] = audio_embeddings_helper.request_script_score_from_elasticsearch(
            es=es,
            query_vector=embedding_vector
        )

    # NOTE(review): audio_embeddings is non-empty here and the loop above adds
    # one entry per element, so this branch looks unreachable — confirm.
    if len(responses) == 0:
        return JSONResponse(
            status_code=400,
            content={"message": f"No audio_embedding matches found."}
        )

    # The embedding ids encode tape/time info and serve as keys of `responses`.
    keys = responses.keys()

    # Find the overall time borders of the query audio_segments.
    lower_bound = inf
    upper_bound = 0

    # Loop though the names of the audio_segments to find the borders
    for key in keys:
        query_name = key.split("_")  # ['audio', 'embedding', '1985', '001A', '2232000', '2234000']
        query_origin_tape_year = f"{query_name[2]}"
        query_origin_tape_name = f"{query_name[3]}"
        query_lower_bound = int(query_name[4])  # 2232000
        query_upper_bound = int(query_name[5])  # 2234000

        # If the current query_lower_bound is lower than the lower_bound
        if query_lower_bound < lower_bound:
            lower_bound = query_lower_bound

        # If the current query_upper_bound is higher than the upper_bound
        if query_upper_bound > upper_bound:
            upper_bound = query_upper_bound

    # NOTE(review): query_origin_tape_year / query_origin_tape_name are
    # re-bound on every iteration above but read below, outside that loop —
    # the self-overlap filter therefore compares against the LAST key's tape
    # only. All keys presumably share one tape here, but confirm; if not,
    # this is a bug.

    # Loop through all keys in the whole response.
    # One key represents one request for an audio_embedding; its response
    # contains a collection of similar audio_segments. Hits that merely match
    # the query segment itself are dropped into cleaned_response.
    cleaned_response = {}
    for key in keys:
        cleaned_response[key] = []

        # Get the response for this key
        response = responses.get(key)

        # Get the first hit-layer from the response
        hits_layer_1 = response.get("hits")

        # The max_score is the highest matching score found for the search
        # vector. NOTE(review): max_score (and segment_score below) are
        # currently unused.
        max_score = hits_layer_1.get("max_score")

        # Get the hits (response audio_segments) which were found for the query vector
        hits_layer_2 = hits_layer_1.get("hits")

        for audio_segment in hits_layer_2:
            # Get the individual score of the response
            segment_score = audio_segment.get("_score")

            # Get the stored fields from the audio_segment hit.
            # NOTE(review): these assignments shadow the outer
            # tape/timestamp variables of the same names.
            segment_fields = audio_segment.get("fields")
            origin_tape_year = segment_fields.get("tape_year")[0]
            origin_tape_name = segment_fields.get("tape_name")[0]
            timestamp_start_ms = segment_fields.get("timestamp_start_ms")[0]
            timestamp_end_ms = segment_fields.get("timestamp_end_ms")[0]

            # If the current audio_segment lies inside the borders of the
            # audio_segment which was the search query, it can be ignored
            if (lower_bound <= timestamp_start_ms <= upper_bound) and \
                    (lower_bound <= timestamp_end_ms <= upper_bound) and \
                    (query_origin_tape_name == origin_tape_name) and \
                    (query_origin_tape_year == origin_tape_year):
                continue

            # Append the current audio_segment to the current key's cleaned list
            cleaned_response[key].append(audio_segment)

    # Flatten all individual responses in order to find runs usable as matches.
    individual_responses = []
    for key in keys:
        for suggestion in cleaned_response[key]:
            individual_responses.append(suggestion)

    # Sort individual_responses by tape, then time, so consecutive hits on the
    # same tape become adjacent.
    individual_responses = sorted(
        individual_responses,
        key=lambda x: (x["fields"]["tape_year"][0], x["fields"]["tape_name"][0], x["fields"]["timestamp_start_ms"][0])
    )

    # Remove duplicate elements from the list (adjacent same _id after sort).
    cleared_responses = []
    buffer = ""
    for resp in individual_responses:
        if buffer != resp["_id"]:
            cleared_responses.append(resp)
            buffer = resp["_id"]

    # Isolate segments: group hits whose start timestamps are exactly 500 ms
    # apart on the same tape into consecutive runs keyed by a counter.
    isolated_responses = defaultdict(list)
    counter = 0
    for resp in cleared_responses:
        if len(isolated_responses[counter]) == 0:
            isolated_responses[counter].append(resp)
            continue

        previous_element = isolated_responses[counter][-1]
        if (previous_element["fields"]["tape_name"][0] == resp["fields"]["tape_name"][0]) and \
                (previous_element["fields"]["tape_year"][0] == resp["fields"]["tape_year"][0]) and \
                (int(previous_element["fields"]["timestamp_start_ms"][0] + 500) == int(
                    resp["fields"]["timestamp_start_ms"][0])):
            isolated_responses[counter].append(resp)
        else:
            counter += 1
            isolated_responses[counter].append(resp)

    # Drop runs that contain an embedding already marked in an orca sequence.
    all_orca_sequences = orca_sequences_helper.get_orca_sequences(es=es)
    isolated_responses_copy = isolated_responses.copy()

    for orca_sequence in all_orca_sequences.keys():
        tape_year = all_orca_sequences[orca_sequence]["tape_year"][0]
        tape_name = all_orca_sequences[orca_sequence]["tape_name"][0]
        timestamp_start_ms = int(all_orca_sequences[orca_sequence]["timestamp_start_ms"][0])
        timestamp_end_ms = int(all_orca_sequences[orca_sequence]["timestamp_end_ms"][0])

        for resp in isolated_responses.keys():
            for dct in isolated_responses.get(resp):
                embedding_tape_year = dct.get("fields").get("tape_year")[0]
                embedding_tape_name = dct.get("fields").get("tape_name")[0]
                embedding_timestamp_start_ms = int(dct.get("fields").get("timestamp_start_ms")[0])
                # embedding_timestamp_end_ms = resp.get(k).get("fields").get("timestamp_end_ms")[0]

                # NOTE(review): the range check is strict on both ends, so an
                # embedding starting exactly at the sequence border is kept —
                # confirm this is intended.
                if (tape_year == embedding_tape_year) and (tape_name == embedding_tape_name) and (
                        timestamp_start_ms < embedding_timestamp_start_ms < timestamp_end_ms):
                    if resp in isolated_responses_copy:
                        del isolated_responses_copy[resp]
                    break

    # Rank the remaining runs by how close their length is to the number of
    # query embeddings (smaller difference = better fit).
    best_fits = []
    for key in isolated_responses_copy.keys():
        best_fits.append(
            (key, len(isolated_responses_copy.get(key)), abs(len(isolated_responses_copy.get(key)) - len(responses))))

    best_fits.sort(key=lambda x: x[2], reverse=False)

    # Keep at most the three best fitting runs.
    result_fits = []
    if len(best_fits) >= 3:
        result_fits = best_fits[:3]
    elif len(best_fits) < 3:
        result_fits = best_fits

    # Build the range name for every selected run from the first and last
    # embedding ids in that run.
    result_embeddings = []
    for fit in result_fits:
        embedding = isolated_responses_copy.get(fit[0])
        embedding_tape_year = embedding[0].get("_id").split("_")[2]
        embedding_tape_name = embedding[0].get("_id").split("_")[3]
        lower_embedding_border = int(embedding[0].get("_id").split("_")[4])
        higher_embedding_border = int(embedding[-1].get("_id").split("_")[5])
        result_embedding_range_name = f"audio_embedding_{embedding_tape_year}_{embedding_tape_name}_{lower_embedding_border}_{higher_embedding_border}"
        result_embeddings.append(result_embedding_range_name)

    return result_embeddings
@router.get("/search-audio_tape/tape_name/{tape_name}/tape_year/{tape_year}", response_model=list,
            responses={
                400: {"model": Message},
                404: {"model": Message}
            })
async def search_audio_tape(tape_year: int, tape_name: str):
    """Search all audio tapes for the given tape and return its characteristics.

    Returns one dict per matching tape document with the fields ``_id``,
    ``name``, ``year``, ``left`` and ``right``; responds with 400 when no
    tape matches the given year/name combination.
    """
    es = get_elasticsearch_client()

    # Look the tape up in elasticsearch.
    audio_tape = audio_tape_helper.get_audio_tape_information(es, tape_year=tape_year, tape_name=tape_name)

    # Check if a tape with the given year/name combination exists.
    if len(audio_tape) == 0:
        return JSONResponse(
            status_code=400,
            content={"message": f"No audio tape with name: {tape_year} {tape_name} found."}
        )

    # Build the characteristics list. Fields are stored as single-element
    # lists; the list default fixes a latent crash — with the previous
    # `.get("name", None)[0]` a missing field raised TypeError instead of
    # yielding None.
    tape_information = []
    for tape_id, tape in audio_tape.items():
        information = {
            "_id": tape_id,
            "name": tape.get("name", [None])[0],
            "year": tape.get("year", [None])[0],
            "left": tape.get("left", [None])[0],
            "right": tape.get("right", [None])[0],
        }
        tape_information.append(information)

    return tape_information


@router.get("/browse/tapes", response_model=dict, responses={404: {"model": Message}})
async def browse_year_tapes():
    """Returns a list of available audio tapes for all years.

    Builds a mapping ``{year: {"tapes": [...], "maxMin": 0}}``; years without
    any tapes (or summaries without a "year" key) are omitted.
    """
    es = get_elasticsearch_client()

    # Reuse the year-overview endpoint to learn which years exist at all.
    years_summaries = await app.routers.top.browse_years()

    available_years = {}
    for dct in years_summaries:
        year = dct.get("year")
        if year is None:
            continue

        response_tapes = audio_tape_helper.get_audio_tapes(es=es, year=year)
        if len(response_tapes) == 0:
            continue

        # Collect the tape names; the list default keeps one malformed
        # document from crashing the whole listing.
        tapes = [tape.get("name", [None])[0] for tape in response_tapes.values()]

        available_years[year] = {
            "tapes": tapes,
            # TODO(review): "maxMin" is hard-coded to 0 — presumably a
            #  placeholder for a per-tape min/max value; confirm and fill in.
            "maxMin": 0
        }

    return available_years