diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 56cbb2dbeb10e2f9c685dfcd1d6a64367b73ae35..0000000000000000000000000000000000000000 --- a/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -files/* -!files/.gitkeep - -data01/ \ No newline at end of file diff --git a/LICENSE b/LICENSE index f97299de436d6c08a18a76ca64c3d1334c719fdb..dfa76f11fdec5f036dcc7551eff604fe3df3a6ca 100644 --- a/LICENSE +++ b/LICENSE @@ -4,4 +4,4 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md index f1a1a4f0c3795ef406b74f86db542d83ebefa481..1fd15b6ae79e2222a1f89d0c2b7eafee9fb9f601 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Orchive3 - From Zero to Hero -This README guides you to the process of both setting up the complete stack for -the Orchive3 application - either for a production environment or locally. 
+This README guides you to the process of setting up the complete stack for the +Orchive3 application - either for a production environment or locally. ## About the Orchive3 @@ -27,11 +27,24 @@ The Orchive3 tech stack consists of ## Setup +### Clone all repositories + +Before you start, you should now clone all Orchive repositories into a folder. +The rest of this document assumes that all repositories are located at +`~/orchive3` but you can of course also adjust the commands (and docker-compose +files) to match your personal preference. + +```bash +mkdir -p ~/orchive3 && cd ~/orchive3 +git clone git@git.informatik.fh-nuernberg.de:orchive3/general.git +git clone git@git.informatik.fh-nuernberg.de:orchive3/orchive3-backend.git +git clone git@git.informatik.fh-nuernberg.de:orchive3/orchive3-frontend.git +``` + ### Download the data -Before you start, you should download the audio data and the labbbooks from the -Orchive website. -You can use the following shell commands to download the files +You should now download the audio data and the labbbooks from the Orchive +website. You can use the following shell commands to download the files ```bash mkdir -p ~/orchive3 && cd ~/orchive3 @@ -43,18 +56,44 @@ mv orchive.cs.uvic.ca/ data/ For local development, you can also only download a subset of the data to speed up the download and execution of the ingest scripts. -### Clone all repositories - -You should now clone all Orchive repositories into a folder. The rest of this -document assumes that all repositories are located at `~/orchive3` but you can -of course also adjust the commands to match your personal preference. 
+After the download move the files to the +`~orchive3/general/deployment/files/raw` folder like so: ```bash -mkdir -p ~/orchive3 && cd ~/orchive3 -git clone git@git.informatik.fh-nuernberg.de:orchive3/general.git -git clone git@git.informatik.fh-nuernberg.de:orchive3/orchive3-backend.git -git clone git@git.informatik.fh-nuernberg.de:orchive3/orchive3-frontend.git -git clone git@git.informatik.fh-nuernberg.de:orchive3/orchive3-infrastructure.git +mv ~/orchive3/data/mp3 ~/orchive3/general/deployment/files/raw/tapes +mv ~/orchive3/data/labbooks ~/orchive3/general/deployment/files/raw/labbooks +``` + +Your folder structure should now look like this: + +```text +~/orchive3/ +┣━ general/ +┃ ┗━ deployment/ +┃ ┣━ files/ +┃ ┃ ┗━ raw/ +┃ ┃ ┣━ tapes/ +┃ ┃ ┃ ┣━ ... +┃ ┃ ┃ ┣━ 1999/ +┃ ┃ ┃ ┃ ┣━ 001A.mp3 +┃ ┃ ┃ ┃ ┗━ ... +┃ ┃ ┃ ┗━ ... +┃ ┃ ┗━ labbooks/ +┃ ┃ ┣━ ... +┃ ┃ ┣━ 1999/ +┃ ┃ ┃ ┣━ aug2299_sep0299.pdf +┃ ┃ ┃ ┗━ ... +┃ ┃ ┗━ ... +┃ ┣━ docker.compose.yml +┃ ┣━ default.conf +┃ ┗━ ... +┣━ orchive3-backend/ +┃ ┗━ ... +┣━ orchive3-frontend/ +┃ ┗━ ... +┣━ orchive3-infrastructure/ +┃ ┗━ ... +┗━ ... ``` ### Start all services @@ -62,17 +101,17 @@ git clone git@git.informatik.fh-nuernberg.de:orchive3/orchive3-infrastructure.gi #### For local development For local development, you can start all services individually. -All services offer a Docker Compose file to start them without any other local +All services include a Docker Compose file to start them without any other local dependencies other than Docker and Docker Compose. Additionally, the services -have additional information in their READMEs that explain how to start them +have additional information in their READMEs that explains how to start them without Docker (e.g., for development). 
The services that you have to start are located at: -- `~/orchive3/orchive3-infrastructure/elastic` -- `~/orchive3/orchive3-backend/cdn` -- `~/orchive3/orchive3-backend/server` -- `~/orchive3/orchive3-frontend/orchive3-frontend` +- `~/orchive3/general/local-deployment/elastic` +- `~/orchive3/general/local-deployment/cdn` +- `~/orchive3/orchive3-backend` +- `~/orchive3/orchive3-frontend` You can `cd` into the respective folders and start them with a simple `docker-compose up` command. Please note that the elastic search service @@ -96,7 +135,7 @@ a production environment. Assuming that you cloned all repositories like described above, you can start all services as follows: ```bash -cd ~/orchive3/general && docker-compose up +cd ~/orchive3/general/deployment && docker-compose up ``` If the repositories reside in different folders, you have to adjust the @@ -104,17 +143,71 @@ If the repositories reside in different folders, you have to adjust the ### Populate the database and process Orchive data -After you started all services, you have to populate the database and process -the data for the CDN. - -The scripts to populate the database are located at -`~/orchive3/orchive3-backend/ingest_scripts`. The README in this folder guides -you through the process of populating the database. +After you started all services, you have to preprocess the data (split mp3s into +channels and extract images from the PDFs) and populate the database. -The scripts to process the data for the CDN are located at -`~/orchive3/orchive3-backend/cdn/preprocessing_scripts` and explained in the -README of the parent folder at `~/orchive3/orchive3-backend/cdn/`. Make sure -that you use the correct output folder for the preprocessing_scripts -(i.e., `~/orchive3/general/files` for production). +The scripts to process the data for the are located at +`~/orchive3/general/scripts/preprocessing_scripts` and explained in the +README of the parent folder at `~/orchive3//general/scripts/cdn/`. 
Make sure +that you use the correct output folder for the preprocessing_scripts (i.e., +`~/orchive3/general/deployment/files` for production). -Both are one time steps that only need to be executed once. +The scripts to populate the database are located at +`~/orchive3/general/scripts/ingest_scripts`. The README in this folder guides +you through the process of populating the database. Some of the ingest scripts +require the preprocessed data, so you should only run them after executing the +preprocessing scripts. + +Both are one-time-steps that only need to be executed once. + +## Final folder structure + +After executing every step as described, your folder structure should look +like this: + +```text +~/orchive3/ +┣━ general/ # The general repository +┃ ┗━ deployment +┃ ┣━ data01/ # The persisted data of the Elasticsearch node for the production environment +┃ ┃ ┗━ ... +┃ ┣━ files/ # The files served by the CDN under the /files sub-url +┃ ┃ ┣━ raw/ +┃ ┃ ┃ ┣━ tapes/ # The raw mp3 files as downloaded from the Orchive +┃ ┃ ┃ ┃ ┣━ ... +┃ ┃ ┃ ┃ ┣━ 1999/ +┃ ┃ ┃ ┃ ┃ ┣━ 001A.mp3 +┃ ┃ ┃ ┃ ┃ ┗━ ... +┃ ┃ ┃ ┃ ┗━ ... +┃ ┃ ┃ ┗━ labbooks/ # The raw labbook pdf files as downloaded from the Orchive +┃ ┃ ┃ ┣━ ... +┃ ┃ ┃ ┣━ 1999/ +┃ ┃ ┃ ┃ ┣━ aug2299_sep0299.pdf +┃ ┃ ┃ ┃ ┗━ ... +┃ ┃ ┃ ┗━ ... +┃ ┃ ┣━ tapes/ # The prepocessed tapes (split channels and ASR data) +┃ ┃ ┃ ┣━ ... +┃ ┃ ┃ ┣━ 1999/ +┃ ┃ ┃ ┃ ┣━ 001A/ +┃ ┃ ┃ ┃ ┃ ┣━ left.jsonl +┃ ┃ ┃ ┃ ┃ ┣━ left.mp3 +┃ ┃ ┃ ┃ ┃ ┣━ right.jsonl +┃ ┃ ┃ ┃ ┃ ┗━ right.mp3 +┃ ┃ ┃ ┃ ┗━ ... +┃ ┃ ┃ ┗━ ... +┃ ┃ ┗━ labboks/ # The preprocessed PDFs (extracted images for every page) +┃ ┃ ┣━ ... +┃ ┃ ┣━ 1999/ +┃ ┃ ┃ ┣━ aug2299_sep0299/ +┃ ┃ ┃ ┃ ┣━ 1.jpg +┃ ┃ ┃ ┃ ┗━ ... +┃ ┃ ┃ ┗━ ... +┃ ┃ ┗━ ... +┃ ┣━ docker.compose.yml # The docker-compose file for the production environment +┃ ┣━ default.conf # The config file of the NGINX reverse proxy for the production environment +┃ ┗━ ... +┣━ orchive3-backend/ +┃ ┗━ ... +┣━ orchive3-frontend/ +┃ ┗━ ... 
+``` diff --git a/deployment/.gitignore b/deployment/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..0cbf53752af433a51191db811be1439a0a9071cf --- /dev/null +++ b/deployment/.gitignore @@ -0,0 +1,2 @@ +files/* +data01/ \ No newline at end of file diff --git a/deployment/README.md b/deployment/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e4aefe68c8dba8f932637e21f08273c4bc61c90 --- /dev/null +++ b/deployment/README.md @@ -0,0 +1 @@ +# Production deployment \ No newline at end of file diff --git a/default.conf b/deployment/default.conf similarity index 100% rename from default.conf rename to deployment/default.conf diff --git a/docker-compose.yml b/deployment/docker-compose.yml similarity index 90% rename from docker-compose.yml rename to deployment/docker-compose.yml index 71532c30729f13a1ae38e5021e94510c66e3348b..5663a4738f5929b522fcddbe75d7d26fac94193c 100644 --- a/docker-compose.yml +++ b/deployment/docker-compose.yml @@ -18,13 +18,13 @@ services: ports: - "9200:9200" frontend: - build: "../orchive3-frontend/orchive3-frontend/." + build: "../../orchive3-frontend/." cdn: image: nginx volumes: - ./files:/usr/share/nginx/html server: - build: "../orchive3-backend/server/." + build: "../../orchive3-backend/." 
environment: - ELASTICSEARCH_HOST=database:9200 - ROOT_PATH=/api diff --git a/files/.gitkeep b/deployment/files/.gitkeep similarity index 100% rename from files/.gitkeep rename to deployment/files/.gitkeep diff --git a/deployment/files/raw/.gitkeep b/deployment/files/raw/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/local-deployment/README.md b/local-deployment/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e3ecb872a28a1c0d0cda92a767a05ece4b754904 --- /dev/null +++ b/local-deployment/README.md @@ -0,0 +1 @@ +# Local deployment (for development) diff --git a/local-deployment/cdn/.gitignore b/local-deployment/cdn/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..a7b2aeecddcd1eb0f5c0f942d185db09cbaa6eae --- /dev/null +++ b/local-deployment/cdn/.gitignore @@ -0,0 +1,2 @@ +files/* +!files/.gitkeep \ No newline at end of file diff --git a/local-deployment/cdn/README.md b/local-deployment/cdn/README.md new file mode 100644 index 0000000000000000000000000000000000000000..973ccdf7f694aa9bc6fa1a454e9b5d17b625b050 --- /dev/null +++ b/local-deployment/cdn/README.md @@ -0,0 +1,17 @@ +# Orchive CDN + +A Docker Nginx container that serves the files that are necessary for the +web application. + +## Start container + +```bash +docker-compose up +``` + +This will start a nginx server that serves the files in the `/files` folder on +port `8001`. + +## Populate `/files` + +See README in `../../scripts`. 
diff --git a/local-deployment/cdn/docker-compose.yml b/local-deployment/cdn/docker-compose.yml new file mode 100644 index 0000000000000000000000000000000000000000..f8548144581da306c8df063adbbd14c89a5dc51d --- /dev/null +++ b/local-deployment/cdn/docker-compose.yml @@ -0,0 +1,6 @@ +cdn: + image: nginx + volumes: + - ./files:/usr/share/nginx/html + ports: + - "8001:80" \ No newline at end of file diff --git a/local-deployment/cdn/files/.gitkeep b/local-deployment/cdn/files/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/local-deployment/elastic/.gitignore b/local-deployment/elastic/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..16d2285fe378e508a7086b621ee53426b4d8a800 --- /dev/null +++ b/local-deployment/elastic/.gitignore @@ -0,0 +1 @@ +data01 \ No newline at end of file diff --git a/local-deployment/elastic/README.md b/local-deployment/elastic/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f8a109eb115883ea0fbca47d8692c9f55cc61e2 --- /dev/null +++ b/local-deployment/elastic/README.md @@ -0,0 +1,21 @@ +# Elasticsearch Docker + +## Prerequisites + +You have to perform the following steps to allow the cluster to start properly: + +### Increase `vm.max_map_count` + +The `vm.max_map_count` kernel setting must be set to at least 262144. +On Linux, you can append `vm.max_map_count=262144` to `/etc/sysctl.conf` and run + +```bash +sysctl -w vm.max_map_count=262144 +``` + +to apply the settings without a restart. + +For other systems, please consult the official +[Elasticsearch documentation][elastic-max-map-count]. 
+ +[elastic-max-map-count]: https://www.elastic.co/guide/en/elasticsearch/reference/7.12/docker.html#_set_vm_max_map_count_to_at_least_262144 diff --git a/local-deployment/elastic/docker-compose.yml b/local-deployment/elastic/docker-compose.yml new file mode 100644 index 0000000000000000000000000000000000000000..6cff66a268afd67e36799b2abe0a14ec58e3764b --- /dev/null +++ b/local-deployment/elastic/docker-compose.yml @@ -0,0 +1,26 @@ +version: '2.2' +services: + es01: + image: docker.elastic.co/elasticsearch/elasticsearch:7.12.1 + container_name: es01 + environment: + - node.name=es01 + - cluster.name=es-docker-cluster + - discovery.type=single-node + - bootstrap.memory_lock=true + - "ES_JAVA_OPTS=-Xms512m -Xmx512m" + - TAKE_FILE_OWNERSHIP=true + ulimits: + memlock: + soft: -1 + hard: -1 + volumes: + - ./data01:/usr/share/elasticsearch/data + ports: + - 9200:9200 + networks: + - elastic + +networks: + elastic: + driver: bridge \ No newline at end of file diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0001dd0fdba548cc3b64841b72523b9c7c4f19f2 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,10 @@ +# Scripts + +This folder contains the 4 types of scripts that exist: + +- `preprocessing` scripts to pre-process the raw data files +- `asr` scripts to automatically transcribe the tapes +- `ocr` scripts to extract labbook entries from the labbooks +- `ingest` scripts to fill the Elasticsearch database + +It's recommended to execute the scripts in this order. diff --git a/scripts/asr/README.md b/scripts/asr/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8fe3f615b23c73e87df1995f0d3b45d9ecca16bd --- /dev/null +++ b/scripts/asr/README.md @@ -0,0 +1,21 @@ +# ASR Scripts + +The scripts in this folder are used to process the audio files and transcribe +them with the [Mod9 ASR Engine](https://mod9.io/). 
+ +## Scripts + +### asr.sh + +The script takes the path to a `.mp3` audio file as the first parameter and +prints the response of the Mod9 engine. + +Example `./asr.sh ~/orchive3/general/files/tapes/1999/001A/left.mp3` + +### asr_all_the_files.sh + +The script takes a path and recursively processes all mp3 files in the directory +and all the subdirectories. The transcript is saved in the same directory as the +file itself. + +Example `./asr_all_the_files.sh ~/orchive3/general/files/tapes/` diff --git a/scripts/asr/asr.sh b/scripts/asr/asr.sh new file mode 100755 index 0000000000000000000000000000000000000000..155b2eb2fd2fe8d7eb7e44d41a2a30bab4089181 --- /dev/null +++ b/scripts/asr/asr.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +asr_server_host=dev.mod9.io +asr_server_port=9900 + +# Read command.json that contains the configuration for the ASR service +json=$(tr -d '\r\n ' < command.json) + +# Convert the given MP3 file to WAV and store it in a temporary file +temp_file=$(mktemp) +sox $1 -q -V1 -t wav -r 16000 -e signed ${temp_file} remix 1 + +# Pass the config and WAV file to the ASR service +(echo ${json}; cat ${temp_file}) | nc $asr_server_host $asr_server_port + +rm ${temp_file} diff --git a/scripts/asr/asr_all_the_files.sh b/scripts/asr/asr_all_the_files.sh new file mode 100755 index 0000000000000000000000000000000000000000..df6e0ed586ac24641e212e2c04038a2c044617b2 --- /dev/null +++ b/scripts/asr/asr_all_the_files.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +for f in $(find $1 -name '*.mp3'); do + echo "Processing $f" + ./asr.sh $f > ${f%.*}.jsonl +done diff --git a/scripts/asr/command.json b/scripts/asr/command.json new file mode 100644 index 0000000000000000000000000000000000000000..123d1cf7f0a0951e63767cbb1afd962261e31a1d --- /dev/null +++ b/scripts/asr/command.json @@ -0,0 +1,9 @@ +{ + "command": "recognize", + "asr-model": "en_video", + "format": "wav", + "speed": 9, + "word-intervals": true, + "word-alternatives": 5, + "batch-threads": 1 +} diff --git 
a/scripts/ingest/README.md b/scripts/ingest/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f1187b8c5dc0c6426c7df4b176b77102eda0911c --- /dev/null +++ b/scripts/ingest/README.md @@ -0,0 +1,199 @@ +# Elastic Search Ingest Scripts for Orchive3 + +## Requirements + +### Python + +- elasticsearch +- pypdf2 + +## Procedure for creating indices and importing data + +One has to execute the following scripts in this order to create the indices +required for Orchive3 and to import the data. + +**Note:** All of these script require additional parameters. These parameters +are explained later in the document. + +1. `create_indices.py` +2. `import_audio_tapes.py` +3. `import_lab_books.py` +4. `import_knn_labels.py` +5. `import_asr_data.py` +6. `import_lab_book_entries.py` + +## Usage of `create_indices.py` + +This script creates all required indices in the specified elastic search +instance. + +**Note:** This script deletes all existing indices with name conflict. + +Indices that get created by this script: + +- `audio_tapes` +- `comment_segments` +- `labeled_segments` +- `lab_books` +- `lab_book_entries` + +### Parameters + +`./create_indices.py <elasticsearch host>` + +| Parameter Name | Explanation | +| ---------------------- | ------------------------------------------------------------------------------------- | +| `<elasticsearch host>` | The hostname and port of the target elasticsearch instance. Example: `localhost:9200` | + +### Usage example + +`./create_indices.py localhost:9200` + +## Usage of `import_audio_tapes.py` + +This scripts imports all tapes based on their name and there position in a +directory hierarchy. + +The tapes have to have the file ending `.mp3` or `.wav` to be recognized by the +script and have to be located in a folder that is named after a year. + +Here is an example for the directory structure: + +```text +<parent folder> +┣━ 1985 +┃ ┣━ 001A.mp3 +┃ ┣━ 001B.mp3 +┃ ┗━ ... +┣━ 1986 +┃ ┗━ ... +┗━ ... 
+``` + +**Note:** +The script can't decide which channel contains the orca sounds and which has the +comments. So the fields `left` and `right` always get set to `none`. + +### Parameters + +`./import_audio_tapes.py <elasticsearch host> <parent folder>` + +| Parameter Name | Explanation | +| ---------------------- | ------------------------------------------------------------------------------------- | +| `<elasticsearch host>` | The hostname and port of the target elasticsearch instance. Example: `localhost:9200` | +| `<parent folder>` | The path to the folder that contains the tapes in the specified hierarchy. | + +### Usage example + +`./import_audio_tapes.py localhost:9200 ~/orchive3/general/deployment/files/raw/tapes` + +## Usage of `import_lab_books.py` + +This script imports the lab books into the `lab_books` index. It also extracts +the start and end dates of the lab books form the file name and determines the +page count. The lab books don't have to be in a specific directory hierarchy but +nested folder structures are supported. + +### Parameters + +`./import_lab_books.py <elasticsearch host> <path to labbooks>` + +| Parameter Name | Explanation | +| ---------------------- | ------------------------------------------------------------------------------------- | +| `<elasticsearch host>` | The hostname and port of the target elasticsearch instance. Example: `localhost:9200` | +| `<path to labbooks>` | The path to the folder that contains the lab books. | + +### Usage example + +`./import_lab_books.py localhost:9200 ~/orchive3/general/deployment/files/raw/labbooks` + +## Usage of `import_knn_labels.py` + +This script imports the labels that were created by the "knn classificator". The +labels have to be stored in a tab separated text file. + +**Note:** The output file of the "knn classificator" doesn't contain the channel +on which the labeled segment was found. This script sets the `channel` field +always to `left`. 
+ +### Parameters + +`./import_knn_labels.py <elasticsearch host> <path to knn file>` + +| Parameter Name | Explanation | +| ---------------------- | ------------------------------------------------------------------------------------- | +| `<elasticsearch host>` | The hostname and port of the target elasticsearch instance. Example: `localhost:9200` | +| `<path to knn file>` | The path to the file `knn_all_out_unique.txt`. | + +### Usage example + +`./import_knn_labels.py localhost:9200 ~/orchive3/general/example-data/knn_all_out_unique.txt` + +## Usage of `import_asr_data.py` + +This script imports transcripts created by the mod9.io asr engine into the +`comment_segments` index. Only transcripts for audio tapes that are already +stored in the `audio_tapes` index will be imported. It's recommended to use +`import_all_asr_data.sh` instead. + +### Parameters + +`./import_asr_data.py <elasticsearch host> <channel name> <tape name> <year>` + +| Parameter Name | Explanation | +| ---------------------- | --------------------------------------------------------------------------------------------------------------- | +| `<elasticsearch host>` | The hostname and port of the target elasticsearch instance. Example: `localhost:9200` | +| `<channel name>` | The name of the channel that was used to create the transcripts. Valid values are `left` and `right` | +| `<tape name>` | The non-unique of the processed name. Example: `001A` for the tape `001A.mp3` | +| `<year>` | The year of the the processed tape. This is important, because the `<tape name>` is not unique. Example: `1985` | + +The input for the actual transcripts is the `stdin` stream. One `json`-document +has to be exactly within one own line. + +### Usage example + +`cat ~/orchive3/general/example-data/asr/001A_1985_c_asr.jsonl | ./import_asr_data.py localhost:9200 left 001A 1985` + +The used `001A_1985_c_asr.jsonl` file stores one `json` document that was +returned by the mod9.io asr engine per line. 
You can also use the output of the +asr engine directly without the intermediary step of storing the data in a +dedicated text file. + +## Usage of `import_all_asr_data.sh` + +Like `import_asr_data.py` but takes iterates over all files in a given folder. + +### Parameters + +`./import_all_asr_data.sh <elasticsearch host> <path to transcripts>` + +| Parameter Name | Explanation | +| ----------------------- | ----------------------------------------------------------------------------------------------------- | ------------ | +| `<elasticsearch host>` | The hostname and port of the target elasticsearch instance. Example: `localhost:9200` | +| `<path to transcripts>` | The path to the transcript files (`.jsonl`). Must have the folder structure `<year>/<tapeName>/<right | left>.jsonl` | + +### Usage example + +`./import_all_asr_data.sh ~/orchive3/general/deployment/files/tapes/ localhost:9200` + +## Usage of `import_lab_book_entries.py` + +This script imports the detected lab book entries from a tsv file. Consult the +section `Notes on OCR for detecting lab book entries` for further details on how +these lab book entries were detected. + +### Parameters + +`./import_lab_book_entries.py <elasticsearch host> <path to tsv>` + +| Parameter Name | Explanation | +| ---------------------- | ------------------------------------------------------------------------------------- | +| `<elasticsearch host>` | The hostname and port of the target elasticsearch instance. Example: `localhost:9200` | +| `<path to tsv>` | The path to the `.tsv` file containing the lab book entries | + +The input for the actual transcripts is the `stdin` stream. One `json`-document +has to be exactly within one own line. 
+ +### Usage example + +`./import_lab_book_entries.py localhost:9200 ~/orchive3/general/example-data/lab_book_entries.tsv` diff --git a/scripts/ingest/__pycache__/constants.cpython-38.pyc b/scripts/ingest/__pycache__/constants.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba15dc12367b0d62740305de6f5d2ce813b33e9e Binary files /dev/null and b/scripts/ingest/__pycache__/constants.cpython-38.pyc differ diff --git a/scripts/ingest/__pycache__/utils.cpython-38.pyc b/scripts/ingest/__pycache__/utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..87d4ca4da3e5f76040590ae43e73b8508f77f3f6 Binary files /dev/null and b/scripts/ingest/__pycache__/utils.cpython-38.pyc differ diff --git a/scripts/ingest/constants.py b/scripts/ingest/constants.py new file mode 100755 index 0000000000000000000000000000000000000000..c116d2fcc018f476a27325c2adf6cd67435b83fe --- /dev/null +++ b/scripts/ingest/constants.py @@ -0,0 +1,9 @@ +AUDIO_TAPES_INDEX = "audio_tapes" +COMMENT_SEGMENTS_INDEX = "comment_segments" +LABELED_SEGMENTS_INDEX = "labeled_segments" +LAB_BOOKS_INDEX = "lab_books" +LAB_BOOK_ENTRIES_INDEX = "lab_book_entries" + +SAMPLE_RATE_HZ = 44100.0 + +VALID_FILE_ENDINGS = [".mp3", ".wav"] \ No newline at end of file diff --git a/scripts/ingest/create_indices.py b/scripts/ingest/create_indices.py new file mode 100755 index 0000000000000000000000000000000000000000..be95aa77128575ed7960f922905d89ce3ef8836b --- /dev/null +++ b/scripts/ingest/create_indices.py @@ -0,0 +1,134 @@ +#!/usr/bin/python3 + +import elasticsearch + +from constants import * +from utils import * + +index_settings = {} + +audio_tape_index = { + "settings": index_settings, + + "mappings": { + "properties": { + "name": {"type": "keyword"}, + "year": {"type": "integer"}, + "left": {"type": "keyword"}, + "right": {"type": "keyword"} + } + } +} + +comment_segment_index = { + "settings": index_settings, + + "mappings": { + "properties": { + 
"audio_tape_ref": {"type": "keyword"}, + "channel": {"type": "keyword"}, + "start": {"type": "integer"}, + "end": {"type": "integer"}, + "transcript": {"type": "text"}, + "words": {"type": "text"}, + "search_timeline_start": {"type": "date", "format": "yyyy-MM-dd"}, + "search_timeline_end": {"type": "date", "format": "yyyy-MM-dd"}, + } + } +} + +labeled_segment_index = { + "settings": index_settings, + + "mappings": { + "properties": { + "audio_tape_ref": {"type": "keyword"}, + "channel": {"type": "keyword"}, + "start": {"type": "integer"}, + "end": {"type": "integer"}, + "label": {"type": "keyword"}, + "classificator_id": {"type": "text"}, + "search_timeline_start": {"type": "date", "format": "yyyy-MM-dd"}, + "search_timeline_end": {"type": "date", "format": "yyyy-MM-dd"}, + } + } + +} + + +lab_book_index = { + "settings": index_settings, + + "mappings": { + "properties": { + "name": {"type": "keyword"}, + "start": {"type": "date", "format": "yyyy-MM-dd"}, + "end": {"type": "date", "format": "yyyy-MM-dd"}, + "page_count": {"type": "integer"}, + } + } +} + + +lab_book_entry_index = { + "settings": index_settings, + + "mappings": { + "properties": { + "lab_book_ref": {"type": "keyword"}, + "type": {"type": "keyword"}, + "value": {"type": "text"}, + "position": {"type": "float"}, + "page": {"type": "integer"}, + "search_timeline_start": {"type": "date", "format": "yyyy-MM-dd"}, + "search_timeline_end": {"type": "date", "format": "yyyy-MM-dd"}, + } + } +} + + +indices_to_create = { + AUDIO_TAPES_INDEX: audio_tape_index, + COMMENT_SEGMENTS_INDEX: comment_segment_index, + LABELED_SEGMENTS_INDEX: labeled_segment_index, + LAB_BOOKS_INDEX: lab_book_index, + LAB_BOOK_ENTRIES_INDEX: lab_book_entry_index +} + +def get_es_host() -> str: + if len(sys.argv) == 2: + return sys.argv[1] + else: + exit_with_error("No host was defined in the command line parameters!\n" + \ + "(Example usage: ./create_indices.py localhost:9200) ") + + +def create_index(es_clt: 
elasticsearch.Elasticsearch, index_name: str, index_definition: dict): + + try: + # Try to create the index + es_clt.indices.create(index_name, + body=index_definition, + wait_for_active_shards=1) + + except elasticsearch.exceptions.RequestError: + # delete the existing index ... + print("Deleted the already existing index", index_name, "to recreate it.") + es_clt.indices.delete(index_name, ignore=404) + + # ... and try one more time. This script shall fail, if it doesn't work this time either + es_clt.indices.create(index_name, + body=index_definition, + wait_for_active_shards=1) + + +def create_orchive3_indicies(es_clt): + + for name, definition in zip(indices_to_create.keys(), indices_to_create.values()): + create_index(es_clt, name, definition) + + +if __name__ == "__main__": + host = get_es_host() + es_clt = get_es_client(host) + create_orchive3_indicies(es_clt) diff --git a/scripts/ingest/import_all_asr_data.sh b/scripts/ingest/import_all_asr_data.sh new file mode 100755 index 0000000000000000000000000000000000000000..e5af0aca44e8ca21ab9269665a93b1112dd8fba4 --- /dev/null +++ b/scripts/ingest/import_all_asr_data.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +ES_HOST=$1 # e.g. localhost:9200 +DIR=$2 # e.g. ~/orchive3/general/files/tapes/ + +for year_dir in $DIR*/ ; do + for tape_dir in $year_dir*/ ; do + year=`awk -F'/' '{ a = length($NF) ? $NF : $(NF-1); print a }' <<< $year_dir` + tape=`awk -F'/' '{ a = length($NF) ? 
$NF : $(NF-1); print a }' <<< $tape_dir` + + if [ -f $tape_dir/right.jsonl ]; then + cat $tape_dir/right.jsonl | ./import_asr_data.py $ES_HOST right $tape $year + fi + if [ -f $tape_dir/left.jsonl ]; then + cat $tape_dir/left.jsonl | ./import_asr_data.py $ES_HOST left $tape $year + fi + echo "Importing $year / $tape" + done +done \ No newline at end of file diff --git a/scripts/ingest/import_asr_data.py b/scripts/ingest/import_asr_data.py new file mode 100755 index 0000000000000000000000000000000000000000..2c3ac57c45463e102260d81ddccd0b25482e4261 --- /dev/null +++ b/scripts/ingest/import_asr_data.py @@ -0,0 +1,101 @@ +#!/usr/bin/python3 +from typing import List +import json +import re + +from utils import * +from constants import * + + +def get_detected_words(data) -> List[str]: + words = [] + for word in data["words"]: + for alternative in word["alternatives"]: + w = alternative["word"] + if len(w.strip()) > 0: + words.append(w) + + # remove duplicates + words = list(dict.fromkeys(words)) + + return words + + +def get_transcript_interval(json_document) -> [float, float]: + # todo use transcript-intervals instead of word-intervals + # todo this also requires to change the command for the mod9 asr server + start = float("inf") + end = 0 + for word in json_document["words"]: + word_interval = word["interval"] + start = min(start, word_interval[0], word_interval[1]) + end = max(end, word_interval[0], word_interval[1]) + + # convert form seconds to ms + start = int(start * 1000) + end = int(end * 1000) + + return start, end + + +def import_detected_stuff_form_json(es_clt: es.Elasticsearch, data: str, audio_tape_ref: str, audio_tape_year: int, channel: str) -> bool: + data = json.loads(data) + + if "transcript" not in data.keys(): + return False + + transcript = data["transcript"] + + # Ignore transcripts that only contain [<something>] tokens + if re.search("^(\[\w+\]\s*)*$", transcript): + return False + words = get_detected_words(data) + interval = 
get_transcript_interval(data) + + comment_segment = { + "audio_tape_ref": audio_tape_ref, + "channel": channel, + "start": interval[0], + "end": interval[1], + "transcript": transcript, + "words": words, + "search_timeline_start": str(audio_tape_year) + "-01-01", + "search_timeline_end": str(audio_tape_year) + "-12-31", + } + + es_clt.index(index=COMMENT_SEGMENTS_INDEX, body=comment_segment) + return True + + +if __name__ == "__main__": + if len(sys.argv) != 5: + exit_with_error("Not enough command line parameters provided!\r\n" + + "(Example usage: ./import_asr_data.py localhost:9200 <channel> <tape name> <year>)\r\n" + + "The json documents need to get passed over via stdin") + + host = sys.argv[1] + channel = sys.argv[2] + tape_name = sys.argv[3] + year = sys.argv[4] + + try: + year = int(year) + except: + exit_with_error("Year isn't a valid int number.") + + es_clt = get_es_client(host) + if es_clt is None: + exit_with_error("Can't connect to the specified elastic search instance.") + + audio_tape_ref, audio_tape = get_es_tape(es_clt, tape_name, year) + if audio_tape_ref is None: + exit_with_error("There is no matching document in the elastic search index.") + + imported_counter = 0 + + for json_document in sys.stdin: + result = import_detected_stuff_form_json(es_clt, json_document, audio_tape_ref, audio_tape["year"], channel) + if result: + imported_counter += 1 + + print(imported_counter, "imported transcripts for", tape_name, "in year", year) diff --git a/scripts/ingest/import_audio_tapes.py b/scripts/ingest/import_audio_tapes.py new file mode 100755 index 0000000000000000000000000000000000000000..0235672018cbb4e98cefafd1711bc0fce266d43c --- /dev/null +++ b/scripts/ingest/import_audio_tapes.py @@ -0,0 +1,60 @@ +#!/usr/bin/python3 +from utils import * +from constants import * + + +def insert_audio_tape_into_elasticsearch(es_clt: es.Elasticsearch, year: int, + tape_name: str, left: str, right: str): + document = { + "name": tape_name, + "year": year, + "left": 
def import_audio_tapes(es_clt: es.Elasticsearch, directory: str):
    """Index every audio tape found below ``directory``.

    The tapes must be laid out as ``<year>/<tapename>.<ext>`` where
    ``<ext>`` is one of ``VALID_FILE_ENDINGS``. Hidden directories and
    directories whose name is not a plausible year are skipped.

    :param es_clt: connected Elasticsearch client
    :param directory: root directory of the year sub-directories
    """
    for year_dir in os.listdir(directory):
        year_path = os.path.join(directory, year_dir)

        # Only descend into visible directories named like a year.
        if not os.path.isdir(year_path) or year_dir.startswith("."):
            continue
        if not year_dir.isdecimal():
            continue

        year = int(year_dir)
        # Plausibility check for the year.
        if year < 1900 or year > 2100:
            continue

        for tape_file in os.listdir(year_path):
            tape_path = os.path.join(year_path, tape_file)
            has_valid_ending = len(tape_file) > 4 and tape_file[-4:] in VALID_FILE_ENDINGS

            if os.path.isfile(tape_path) and has_valid_ending:
                # Channel files are unknown at import time, hence "none".
                insert_audio_tape_into_elasticsearch(es_clt, year, tape_file[:-4], "none", "none")
def interpret_finding(f: str) -> KnnLabel:
    """Parse a knn classifier finding filename into a ``KnnLabel``.

    Findings look like: ``orca_1_1985_003A_39226950_39667950.wav``
    (label, index, year, tape name, start sample, stop sample).

    On a malformed name a warning is written to stderr and a partially
    filled label is returned (best-effort, as before).
    """
    stem = f.split(".")[0]  # remove the .wav extension
    parts = stem.split("_")

    label = KnnLabel()

    try:
        label.tape_name = parts[3]
        label.year = int(parts[2])
        label.start_ms = convert_sample_nr_to_ms(int(parts[4]))
        label.stop_ms = convert_sample_nr_to_ms(int(parts[5]))
    except (IndexError, ValueError):
        # Narrowed from a bare except: only malformed filenames are
        # expected to fail here.
        sys.stderr.write("Failed to parse \"" + f.strip() + "\"\n")

    return label


def interpret_line(l: str) -> KnnLabel:
    """Parse one tab separated line ``<label name>\\t<finding filename>``.

    The original return annotation claimed ``List[KnnLabel]``; a single
    ``KnnLabel`` is returned.
    """
    columns = l.split("\t")
    knn_label = interpret_finding(columns[1])
    knn_label.label_name = columns[0]
    return knn_label


def load_knn_labels_from_txt(path_to_txt: str) -> typing.List[KnnLabel]:
    """Load all knn labels from a tab separated text file.

    :param path_to_txt: path to the knn results file
    :return: one ``KnnLabel`` per line of the file
    """
    # Context manager: the original never closed the file handle.
    with open(path_to_txt) as f:
        return [interpret_line(line) for line in f.readlines()]
+ "channel": "left", + "start": knn_label.start_ms, + "end": knn_label.stop_ms, + "label": knn_label.label_name, + "classificator_id": "knn", + "search_timeline_start": str(year) + "-01-01", + "search_timeline_end": str(year) + "-12-31", + } + + es_clt.index(index=LABELED_SEGMENTS_INDEX, body=labeled_segment) + + imported += 1 + except Exception as err: + print(err) + + return imported, failed + + +if __name__ == "__main__": + if len(sys.argv) != 3: + exit_with_error("Not enough command line parameters provided!\n" + + "(Example usage: ./import_knn_labels.py localhost:9200 ~/orchive3/general/example-data/knn_all_out_unique.txt)\n") + + host = sys.argv[1] + path_to_knn_txt = sys.argv[2] + + es_clt = get_es_client(host) + if es_clt is None: + exit_with_error("Failed to connect to the specified elastic search instance.") + + imported, failed = import_knn_labels(es_clt, path_to_knn_txt) + print("Imported:", imported, "; Failed:", failed) diff --git a/scripts/ingest/import_lab_book_entries.py b/scripts/ingest/import_lab_book_entries.py new file mode 100755 index 0000000000000000000000000000000000000000..7b1dead051b9722d258f4c0ff311cc3adc26cc46 --- /dev/null +++ b/scripts/ingest/import_lab_book_entries.py @@ -0,0 +1,57 @@ +#!/usr/bin/python3 +import json + +from constants import * +from utils import * + + +def import_lab_book_entries(es_clt, file_path): + f = open(file_path) + + num_imported = 0 + + for line in f: + if len(line.strip()) == 0: + continue + + fields = line.split("\t") + lab_book_name = fields[0] + entry = fields[1] + + lab_book = get_lab_book(es_clt, lab_book_name) + if lab_book is None: + print("Couldn't find the parent lab book with the name " + lab_book_name) + continue + + entry = json.loads(entry) + + entry["lab_book_ref"] = lab_book["_id"] + if "search_timeline_start" not in entry: + entry["search_timeline_start"] = lab_book["_source"]["start"] + + if "search_timeline_end" not in entry: + entry["search_timeline_end"] = lab_book["_source"]["end"] + + 
def parse_date_string(date_string: str) -> str:
    """Convert a lab book date token into a ``yyyy-MM-dd`` string.

    Supported formats (D = day digit): ``mmmDyy``, ``mmmDyyyy``,
    ``mmmDDyy`` and ``mmmDDyyyy``, e.g. ``feb1585`` -> ``1985-02-15``.
    Two digit years > 50 are mapped to 19xx, others to 20xx.

    :param date_string: raw date token from a lab book filename
    :raises ValueError: if the format or the month cannot be recognized
    """
    # Local copy of the month lookup so the function is self-contained.
    months = {"jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
              "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12}

    date_string = date_string.strip()
    l = len(date_string)

    # Format mmmDyy or mmDyyyy
    if l == 6 or l == 8:
        month_str = date_string[:3]
        day_str = date_string[3]
        year_str = date_string[4:]

    # Format mmmDDyy or mmDDyyyy
    elif l == 7 or l == 9:
        month_str = date_string[:3]
        day_str = date_string[3:5]
        year_str = date_string[5:]

    # Unsupported Format
    else:
        raise ValueError("Unsupported date format: " + date_string)

    # Convert the month abbreviation into a number
    if month_str.lower() not in months:
        raise ValueError("Can't determine month of " + date_string)

    month = months[month_str.lower()]
    day = int(day_str)
    year = int(year_str)

    # Convert two digit years into four digit years
    if year < 100:
        if year > 50:
            year += 1900
        else:
            year += 2000

    # Generate a yyyy-MM-dd string
    return "{:04d}-{:02d}-{:02d}".format(year, month, day)
+ print("Failed: ", path) + raise e + + +if __name__ == "__main__": + + if len(sys.argv) != 3: + exit_with_error("Not enough command line parameters provided!\n" + \ + "(Example usage: ./import_lab_books.py localhost:9200 ~/orchive3/data/labbooks) ") + + host = sys.argv[1] + lab_books_dir = sys.argv[2] + + es_clt = get_es_client(host) + import_lab_books(es_clt, lab_books_dir) diff --git a/scripts/ingest/requirements.txt b/scripts/ingest/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee3f4af4bd32cf9bbd8762b800921a9b96e39d55 --- /dev/null +++ b/scripts/ingest/requirements.txt @@ -0,0 +1,2 @@ +elasticsearch==7.12.1 +PyPDF2==1.26.0 \ No newline at end of file diff --git a/scripts/ingest/utils.py b/scripts/ingest/utils.py new file mode 100755 index 0000000000000000000000000000000000000000..6ad2ce5241e33d1cac369b5adc8c7606eff9da56 --- /dev/null +++ b/scripts/ingest/utils.py @@ -0,0 +1,61 @@ +import os +from typing import Tuple + +import elasticsearch as es +import sys + +from constants import * + + +def get_es_client(es_host: str) -> es.Elasticsearch: + return es.Elasticsearch([es_host], + use_ssl=False, + verify_certs=False, + ssl_show_warn=False) + + +def get_es_tape(es_clt: es.Elasticsearch, tape_name: str, year: int) -> Tuple[str, dict]: + unique_tape_name = tape_name + + query = { + "query": { + "bool": { + "must": [ + {"match": {"name": unique_tape_name}}, + {"match": {"year": year}} + ] + } + } + } + + result = es_clt.search(index=AUDIO_TAPES_INDEX, body=query) + + if result['hits']['total']['value'] != 1: + return None, None + + return result['hits']['hits'][0]['_id'], result['hits']['hits'][0]['_source'] + + +def get_lab_book(es_clt: es.Elasticsearch, lab_book_name: str): + + query = { + "query": { + "match": { + "name": { + "query": lab_book_name + } + } + } + } + + result = es_clt.search(index=LAB_BOOKS_INDEX, body=query) + + if len(result["hits"]["hits"]) == 1: + return result["hits"]["hits"][0] + + return None + + +def 
def exit_with_error(message: str):
    """Print *message* to stderr and terminate with a usage exit code."""
    sys.stderr.write("{}\r\n".format(message))
    sys.exit(os.EX_USAGE)
The +results of the service were stored with the same naming convention that gets +used by the `preprocess_lab_books.py` script with the only difference that the +files have a `.json` extension instead of `.jpg`. + +### Parameters + +`./interpret_ocr_result.py <lab book name> <json path> <img path> <output file>` + +| Parameter Name | Explanation | +| ----------------- | ------------------------------------------------------------------------------------------------------------------------------ | +| `<lab book name>` | The name of the lab book that is also stored in ElasticSearch. This is needed to reference the entries to the actual lab book. | +| `<json path>` | The path to the json files containing the ocr results. | +| `<img path>` | The path to the directory containing the source images. This is needed for calculating the relative position of an entry | +| `<output file>` | The path to the output file to which the interpreted findings will be written | + +### Usage example + +`./preprocess_lab_books.py "feb1585_jul0586" ~orchive3/general/example-data/aug2591_sep1191_ocr_raw/aug2591_sep1191/json ~orchive3/general/example-data/aug2591_sep1191_ocr_raw/aug2591_sep1191/imgs ~/orchive3/general/deployment/files/raw/lab_book_entries.tsv` diff --git a/scripts/ocr/interpret_ocr_results.py b/scripts/ocr/interpret_ocr_results.py new file mode 100755 index 0000000000000000000000000000000000000000..1a97d5ee4eea6bd01eeb7fc04fc4c4757fdc5f62 --- /dev/null +++ b/scripts/ocr/interpret_ocr_results.py @@ -0,0 +1,432 @@ +#!/usr/bin/python3 + +import os +import sys +from datetime import date + +import regex +from PIL import Image +import json + + +class OCRDate: + def __init__(self, date, x, y, w, h): + self.date = date + self.x = x + self.y = y + self.w = w + self.h = h + self.page = 0 + self.rel_page_pos = 0 + +class OCRTapeRef: + def __init__(self, tape, x, y, w, h): + self.tape = tape + self.x = x + self.y = y + self.w = w + self.h = h + self.page = 0 + self.rel_page_pos = 0 + 
def calculate_bounding_box(ocr_data, start_token_index, stop_token_index):
    """Compute the pixel bounding box covering a range of OCR tokens.

    :param ocr_data: decoded Vision AI response with ``textAnnotations``
    :param start_token_index: first token index (inclusive)
    :param stop_token_index: last token index (inclusive)
    :return: ``(x, y, w, h)`` of the union of all token bounding polys
    """
    xs = []
    ys = []

    for i in range(start_token_index, stop_token_index + 1):
        annotation = ocr_data["textAnnotations"][i]
        for vert in annotation["boundingPoly"]["vertices"]:
            # The Vision API omits zero-valued coordinates from the
            # JSON, so a plain vert["x"] can raise KeyError; default to 0.
            xs.append(vert.get("x", 0))
            ys.append(vert.get("y", 0))

    # Clamp the origin to the image (coordinates can be slightly negative).
    bbox_x = max(0, min(xs))
    bbox_y = max(0, min(ys))

    bbox_w = max(xs) - bbox_x
    bbox_h = max(ys) - bbox_y

    return bbox_x, bbox_y, bbox_w, bbox_h
+ + for i in range(1, len(ocr_data["textAnnotations"])): + # search for dates + + window = ocr_data["textAnnotations"][i]["description"] + window_start = i + window_stop = i + window_size = len(window) + + # concat the tokens until there is a big enough window for the regex + if window_size < min_window_size: + for j in range(i + 1, len(ocr_data["textAnnotations"])): + window_stop = j + window += ocr_data["textAnnotations"][j]["description"] + window_size = len(window) + if window_size >= min_window_size: + break + + for _ in range(i, len(ocr_data["textAnnotations"])): + # check for match in current window: + re_match = tape_regex.match(window) + if re_match: + ret = mapper(re_match, window_start, window_stop) + if ret: + detected_items.append(ret) + break + else: + if window_size > max_window_size or window_stop == len(ocr_data["textAnnotations"]) - 1: + break + else: + window_stop += 1 + window += ocr_data["textAnnotations"][window_stop]["description"] + window_size = len(window) + + return detected_items + + +def find_dates(ocr_data, lab_book_start_date, lab_book_stop_date): + + def mapper(re_match, window_start, window_stop): + match_str = re_match.groups()[0] + date_parts = regex.split("[.:,;\-_/\\()]", match_str) + + month = int(date_parts[0]) + day = int(date_parts[1]) + year = int(date_parts[2]) + + # we need a 4 digit year for the conversion to iso + if 99 < year < 1980: + # three digits or before 1980 is invalid + return None + elif 0 <= year <= 99: + if year > 80: + year += 1900 + else: + year += 2000 + + try: + decoded_date = date.fromisoformat("%d-%02d-%02d" % (year, month, day)) + + # on last validity check + if lab_book_start_date <= decoded_date <= lab_book_stop_date: + # store the date + bbox_x, bbox_y, bbox_w, bbox_h = calculate_bounding_box(ocr_data, window_start, window_stop) + detected_date = OCRDate(decoded_date, bbox_x, bbox_y, bbox_w, bbox_h) + return detected_date + except ValueError: + pass + + return None + + return 
find_ocr_stuff_helper(ocr_data, 6, 10, "(\d?\d[.:,;\-_/\\()]\d?\d[.:,;\-_/\\()]\d?\d?\d\d)$", mapper) + + +def find_tapes(ocr_data) -> list[OCRTapeRef]: + + def map_tape_side_letter(tape_name): + a_alternatives = ["A", "G", "H", "I", "J", "K", "L", "M", "N", "T", "x", "Y" ] + side_letter = tape_name[-1] + if side_letter in a_alternatives: + tape_name = str(tape_name[:-1]) + "A" + else: + tape_name = str(tape_name[:-1]) + "B" + + return tape_name + + def mapper1(re_match, window_start, window_stop): + match_str = re_match.groups()[0] + bbox_x, bbox_y, bbox_w, bbox_h = calculate_bounding_box(ocr_data, window_start, window_stop) + + tape_name = regex.sub("[#\-_,;.:\s]+", "", match_str).upper() + tape_name = map_tape_side_letter(tape_name) + + if len(tape_name) == 2: + tape_name = "00" + tape_name + if len(tape_name) == 3: + tape_name = "0" + tape_name + + ocr_tape_ref = OCRTapeRef(tape_name, bbox_x, bbox_y, bbox_w, bbox_h) + return ocr_tape_ref + + def mapper2(re_match, window_start, window_stop): + match_str = re_match.groups()[0] + bbox_x, bbox_y, bbox_w, bbox_h = calculate_bounding_box(ocr_data, window_start, window_stop) + + tape_name = match_str.lower() + tape_name = tape_name.replace("tape", "").replace("side", "") + tape_name = map_tape_side_letter(tape_name) + tape_name = regex.sub("[#\-_,;.:\s]+", "", tape_name).upper() + + if len(tape_name) == 2: + tape_name = "00" + tape_name + if len(tape_name) == 3: + tape_name = "0" + tape_name + + ocr_tape_ref = OCRTapeRef(tape_name, bbox_x, bbox_y, bbox_w, bbox_h) + return ocr_tape_ref + + tapes1 = find_ocr_stuff_helper(ocr_data, 3, 8, "(#\d{1,3}[A-Za-z])", mapper1) + tapes2 = find_ocr_stuff_helper(ocr_data, 3, 8, "(#\d{1,3}[\-_,;.:][A-Za-z])", mapper1) + + tapes3 = find_ocr_stuff_helper(ocr_data, 12, 18, "(tape\s*\d{1,3}\s*[\-_,;.:]\s*side\s*[A-Za-z])", mapper2) + tapes4 = find_ocr_stuff_helper(ocr_data, 6, 15, "(tape\s*\d{1,3}\s*[\-_,;.:]\s*[A-Za-z])", mapper2) + + return tapes1 + tapes2 + tapes3 + tapes4 + + +def 
def find_entries(ocr_data):
    """Split the OCR full text into lab book entries and locate them.

    An entry starts at a line whose first character is a digit (the lab
    books number their observations) and runs until the next such line.
    For each entry the matching token range in ``textAnnotations`` is
    searched so a pixel bounding box can be computed.

    Returns a list of ``OCREntry`` objects (entry text + bounding box).
    """
    ocr_entries = []

    full_text = ocr_data["fullTextAnnotation"]["text"]
    tokens = ocr_data["textAnnotations"]  # [window_stop]["description"]
    lines = full_text.split("\n")

    # Pass 1: group consecutive text lines into entries. A digit at the
    # start of a line opens a new entry; other lines continue the
    # current one.
    entry = ""
    entries = []
    for line in lines:
        if len(line) == 0:
            continue

        if str.isdigit(line[0]):
            if len(entry) > 0:
                entries.append(entry.strip())
            entry = line
        else:
            entry += "\n" + line

    if len(entry) > 0:
        entries.append(entry.strip())

    # Pass 2: walk the token list in parallel with the entries to find
    # each entry's token range. Starts at 1 — index 0 is presumably the
    # whole-page annotation (matches usage elsewhere in this file).
    token_ctr = 1

    for i in range(0, len(entries)):
        entry = entries[i]
        start_token = token_ctr
        # Tokens carry no whitespace, so compare against the entry text
        # with all spaces/newlines stripped.
        text = entry.replace(" ", "")
        text = text.replace("\n", "")

        combined_tokens = ""

        # Concatenate tokens until their combined length reaches the
        # entry's length; the first token beyond it starts the next entry.
        for j in range(token_ctr, len(tokens)):
            if len(text) > len(combined_tokens):
                combined_tokens += tokens[j]["description"]
            else:
                token_ctr = j
                break

        # backtrack if we went too far due to missing special characters
        # NOTE(review): range(token_ctr, 0) with the default +1 step is
        # always empty, so this backtracking loop never executes; it was
        # probably meant to count downwards, e.g. range(token_ctr, 0, -1).
        for k in range(token_ctr, 0):
            if text.endswith(tokens[k]["description"]):
                token_ctr = k + 1
                break

        # The last entry consumes all remaining tokens.
        if i == len(entries) - 1:
            token_ctr = len(tokens)

        bbox_x, bbox_y, bbox_w, bbox_h = calculate_bounding_box(ocr_data, start_token, max(start_token, token_ctr - 1))
        ocr_entries.append(OCREntry(entry, bbox_x, bbox_y, bbox_w, bbox_h))

    return ocr_entries
def find_appropriate_in_lut(date_lut, item_position):
    """Return the date of the last LUT entry at or before *item_position*.

    :param date_lut: list of ``{"pos": float, "date": ...}`` dicts,
        sorted ascending by ``"pos"``
    :param item_position: relative page position of the detected item
    :return: the matching date, or ``None`` if the LUT is empty or its
        first entry already lies beyond *item_position*
    """
    # Renamed from `date`: the original local shadowed the imported
    # datetime.date class.
    found_date = None

    for lut_entry in date_lut:
        if lut_entry["pos"] > item_position:
            break
        found_date = lut_entry["date"]

    return found_date
postprocess_detected_items(detected_entries, page_number, is_right_page, img_width, img_height) + + date_lut = [{"pos": 0.0, "date": last_date}] + for detected_date in detected_dates: + date_lut.append({ "pos": detected_date.rel_page_pos, "date": detected_date.date}) + + date_lut = list(sorted(date_lut, key=lambda x: x["pos"])) + + + tsv_entries = [] + for detected_date in detected_dates: + tsv_entries.append(get_lab_book_entry_for_tsv(detected_date, date_lut)) + for detected_tape in detected_tapes: + tsv_entries.append(get_lab_book_entry_for_tsv(detected_tape, date_lut)) + for detected_entry in detected_entries: + tsv_entries.append(get_lab_book_entry_for_tsv(detected_entry, date_lut)) + + return tsv_entries, date_lut[-1]["date"] + + +if __name__ == "__main__": + + if len(sys.argv) != 5: + print("Not enough command line parameters provided!\r\n" + + "(Example usage: ./interpret_ocr_result_MKII.py <lab_book_name> <json path> <img path> <output_file>)\r\n") + sys.exit(os.EX_USAGE) + + lab_book_name = sys.argv[1] + json_doc_path = sys.argv[2] + img_path = sys.argv[3] + output_file = open(sys.argv[4], "w") + + labbook_start_end_date = lab_book_name.split("_") + start_date = parse_date_string(labbook_start_end_date[0]) + end_date = parse_date_string(labbook_start_end_date[1]) + + last_date = start_date + + for json_doc_name in sorted(os.listdir(json_doc_path)): + if not json_doc_name.endswith(".json"): + continue + + try: + print(json_doc_name) + + cur_json_doc_path = os.path.join(json_doc_path, json_doc_name) + json_doc = json.load(open(cur_json_doc_path)) + + ref = os.path.basename(cur_json_doc_path).split(".")[0] + refs = ref.split("_") + page_num = int(refs[0]) + is_right_page = refs[1].lower() == "right" + + img = Image.open(os.path.join(img_path, ref + ".jpg")) + page_width, page_height = img.size + + # The original page image is twice as wide, because one image contains two pages + page_width *= 2 + + entries, last_date = process_page(json_doc, page_num, 
is_right_page, page_width, page_height, + start_date, end_date, last_date) + + for entry in entries: + output_file.write(lab_book_name + "\t" + json.dumps(entry, sort_keys=True, default=str ) + "\r\n") + except Exception as e: + print("Failed") + print(e) + + output_file.close() \ No newline at end of file diff --git a/scripts/ocr/ocr.py b/scripts/ocr/ocr.py new file mode 100644 index 0000000000000000000000000000000000000000..aebae05a0742788a27c485bb90be309e9113130d --- /dev/null +++ b/scripts/ocr/ocr.py @@ -0,0 +1,57 @@ +import io +import os +import json + +import sys + +# Imports the Google Cloud client library +from google.cloud import vision +from google.cloud.vision import AnnotateImageResponse + + +# Importing an auth key: https://cloud.google.com/vision/docs/libraries + +def text_detection(vision_ai_client, image_path, result_file_path): + + # The name of the image file to annotate + file_name = os.path.abspath(image_path) + + # Loads the image into memory + with io.open(file_name, 'rb') as image_file: + content = image_file.read() + + image = vision.Image(content=content) + + # Performs label detection on the image file + response = vision_ai_client.document_text_detection(image=image) + labels = response.label_annotations + + # Convert to json + # WTF... 
def text_detection(vision_ai_client, image_path, result_file_path):
    """Run Vision AI document text detection on an image and save the
    raw response as JSON.

    :param vision_ai_client: an instantiated ``vision.ImageAnnotatorClient``
    :param image_path: path of the image file to annotate
    :param result_file_path: path of the JSON file to write
    """
    # The name of the image file to annotate
    file_name = os.path.abspath(image_path)

    # Loads the image into memory
    with io.open(file_name, 'rb') as image_file:
        content = image_file.read()

    image = vision.Image(content=content)

    # Performs document text detection on the image file.
    # (The original also read response.label_annotations into an unused
    # variable — dead code, removed.)
    response = vision_ai_client.document_text_detection(image=image)

    # Convert the protobuf response to plain JSON via serialize/deserialize.
    # WTF... https://stackoverflow.com/questions/48623615/convert-google-vision-api-response-to-json
    serialized_proto_plus = AnnotateImageResponse.serialize(response)
    deserialized_proto_plus = AnnotateImageResponse.deserialize(serialized_proto_plus)

    response_json = AnnotateImageResponse.to_json(deserialized_proto_plus)
    json_data = json.loads(response_json)

    with open(result_file_path, 'w') as f:
        json.dump(json_data, f)
This script ensures that all pdf files are stored in the correct directory for the year in which
they begin.
Each channel of the audio tapes has to be stored as a separate audio file in order
to be easily accessible for the frontend.
#!/usr/bin/python3

import os
import shutil
import sys


def get_year_from_date_string(date_string: str) -> int:
    """Parse the four-digit start year out of a labbook date token.

    Supported token lengths: 6/8 chars (mmmDyy / mmDyyyy) and
    7/9 chars (mmmDDyy / mmDDyyyy).  Two-digit years are pivoted at 50:
    51-99 -> 19xx, 00-50 -> 20xx.

    :raises ValueError: for any other token length.
    """
    date_string = date_string.strip()
    length = len(date_string)

    if length in (6, 8):
        # Format mmmDyy or mmDyyyy
        year_str = date_string[4:]
    elif length in (7, 9):
        # Format mmmDDyy or mmDDyyyy
        year_str = date_string[5:]
    else:
        # Unsupported format
        raise ValueError("Unsupported date format: %s" % date_string)

    year = int(year_str)

    # Convert two digit years into four digit years (pivot at 1950/2050)
    if year < 100:
        year += 1900 if year > 50 else 2000

    return year


def get_start_year_from_filename(filename: str):
    """Return the start year encoded in a labbook filename such as
    'feb1585_jul0586.pdf' (-> 1985)."""
    # remove the .pdf ending, then take the first date token
    name = filename.split(".")[0]
    date_strings = name.split("_")
    return get_year_from_date_string(date_strings[0])


def move_lab_book(base_path: str, src_year: int, lab_book: str, dst_year: int) -> None:
    """Move <base_path>/<src_year>/<lab_book> into <base_path>/<dst_year>/,
    creating the destination year directory if necessary.

    :raises ValueError: if the destination exists but is not a directory,
        or if the target file already exists.
    """
    dst_path = os.path.join(base_path, str(dst_year))
    if not os.path.exists(dst_path):
        os.mkdir(dst_path)
    if not os.path.isdir(dst_path):
        raise ValueError("%s isn't a directory" % str(dst_path))

    dst_path = os.path.join(dst_path, lab_book)
    if os.path.exists(dst_path):
        raise ValueError("%s already exists" % str(dst_path))

    shutil.move(os.path.join(base_path, str(src_year), lab_book), dst_path)


def correct_lab_book_year_mapping(base_dir: str) -> None:
    """Move every labbook PDF below base_dir/<year>/ into the directory of
    the year its filename says it starts in.

    Failures for individual files are reported and skipped (best effort).
    """
    years = filter(lambda x: os.path.isdir(os.path.join(base_dir, x)) and x.isnumeric(),
                   os.listdir(base_dir))

    for year in years:
        path_year = os.path.join(base_dir, year)
        lab_books = filter(lambda x: os.path.isfile(os.path.join(path_year, x)) and x.endswith(".pdf"),
                           os.listdir(path_year))

        for lab_book in lab_books:
            try:
                start_year = get_start_year_from_filename(lab_book)

                if start_year != int(year):
                    move_lab_book(base_dir, year, lab_book, start_year)
                    # BUGFIX: report the move as <src> -> <dst>; the original
                    # printed "start_year -> year", i.e. the direction reversed
                    print("%s : %s -> %d" % (lab_book, year, start_year))
            except Exception as e:
                # best effort: report and continue with the next labbook
                print("%s/%s failed: %s" % (year, lab_book, str(e)))


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Not enough command line parameters provided!\r\n" +
              "(Example usage: ./correct_labbook_year_mapping.py <path to dir containing the year dirs with the lab books>!)\r\n")
        sys.exit(os.EX_USAGE)

    work_dir = sys.argv[1]

    if not os.path.exists(work_dir):
        print("%s doesn't exist" % work_dir)
        sys.exit(os.EX_USAGE)

    correct_lab_book_year_mapping(work_dir)
#!/bin/bash
#
# Split stereo tape mp3s into two mono files (left.mp3 / right.mp3) using sox.
# Input layout:  <input dir>/<year>/<tape>.mp3
# Output layout: <output dir>/files/tapes/<year>/<tape>/{left,right}.mp3

NUM_PARALLEL_TASKS=8 # the script will use NUM_PARALLEL_TASKS threads on the system

# Block until a background-job slot is free.
# https://stackoverflow.com/questions/49823080/use-bash-wait-in-for-loop
throttle_jobs() {
    if [ "$(jobs -r | wc -l)" -ge "$NUM_PARALLEL_TASKS" ]; then
        wait "$(jobs -r -p | head -1)"
    fi
}

if [ $# != 2 ] ; then
    echo "Not enough command line parameters provided!"
    echo "(Example usage: ./split_channels.sh <path to audio files> <output directory>)"
    exit 1   # 'exit -1' is not a valid exit status
fi

input_dir=$(realpath "$1")
output_base_path=$(realpath "$2")

for year_dir in "${input_dir}"/*/ ; do
    year=$(basename "$year_dir")

    for mp3_file in "${year_dir}"/*.mp3 ; do

        # quote paths everywhere so tapes with spaces in the name work
        tape=$(basename "$mp3_file")
        dest_dir="$output_base_path/files/tapes/$year/${tape%.mp3}/"

        # create the output directory if it doesn't exist
        mkdir -p "$dest_dir"

        # limit the number of parallel running tasks, then split each channel
        throttle_jobs
        sox "$mp3_file" "$dest_dir/left.mp3" remix 1 &

        throttle_jobs
        sox "$mp3_file" "$dest_dir/right.mp3" remix 2 &

    done
done

# wait for the remaining background sox jobs before exiting
wait