diff --git a/command_history.txt b/command_history.txt
new file mode 100644
index 0000000..98886a6
--- /dev/null
+++ b/command_history.txt
@@ -0,0 +1,47 @@
+ 1 git clone https://github.com/sanchit-gandhi/whisper-jax.git
+ 2 cd whisper-jax
+ 3 pip install -r requirements.txt
+ 4 pip install numpy scipy jax jaxlib flax optax transformers
+ 5 cd ..
+ 6 git clone https://github.com/davabase/whisper_real_time.git
+ 7 cd whisper_real_time/
+ 8 pip install -r requirements.txt
+ 9 sudo apt-get install -y portaudio19-dev
+ 10 pip install pyaudio
+ 11 pip install -r requirements.txt
+ 12 python3.7 transcribe_demo.py
+ 13 python transcribe_demo.py
+ 14 cd ..
+ 15 git clone https://github.com/openai/whisper.git
+ 16 cd whisper
+ 17 pip install -U openai-whisper
+ 18 pip install --upgrade --no-deps --force-reinstall git+https://github.com/openai/whisper.git
+ 19 sudo apt update && sudo apt install ffmpeg
+ 20 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+ 21 pip install setuptools-rust
+ 22 whisper audio.flac audio.mp3 audio.wav --model medium
+ 23 audio.flac: No such file or directory
+ 24 whisper audio.mp3 --model medium
+ 25 mkdir whisper_api
+ 26 cd whisper
+ 27 cd ..
+ 28 cd whisper_api/
+ 29 touch app.py
+ 30 mkdir uploads
+ 31 python app.py
+ 32 pip install flask flask-restful
+ 33 python app.py
+ 34 cd whisperX
+ 35 sudo apt-get update
+ 36 sudo apt-get install ffmpeg
+ 37 ffmpeg -v
+ 38 whisperx audio.mp3 --compute_type float32
+ 39 cd ..
+ 40 cd whisper-jax/
+ 41 pip install git+https://github.com/deepmind/dm-haiku
+ 42 pip install git+https://github.com/sanchit-gandhi/whisper-jax
+ 43 pip install git+https://github.com/deepmind/optax
+ 44 pip install -e .["endpoint"]
+ 45 python app/app.py
+ 46 history
+ 47 history > command_history.txt
diff --git a/dockerize.txt b/dockerize.txt
new file mode 100644
index 0000000..9f8c1fc
--- /dev/null
+++ b/dockerize.txt
@@ -0,0 +1,76 @@
+To Dockerize this environment, you'll first need to create a Dockerfile: a file containing the instructions for building a Docker image that encapsulates all of your environment's requirements.
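+
+One optional preliminary: this repository also contains large zip archives (whisper-jax.zip, whisper.zip, whisperX.zip), and 'docker build' sends everything in the directory to the Docker daemon as the build context. Adding a .dockerignore keeps builds fast; the entries below are only a suggestion:
+
+```bash
+# Exclude bulky or irrelevant files from the Docker build context.
+cat > .dockerignore <<'EOF'
+*.zip
+command_history.txt
+EOF
+```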
+
+Here's a simple Dockerfile to get you started:
+
+```dockerfile
+# Use an official Python runtime as a parent image
+FROM python:3.7-slim-buster
+
+# Update and install system dependencies
+RUN apt-get update && apt-get install -y \
+    git \
+    ffmpeg \
+    portaudio19-dev \
+    curl \
+    build-essential
+
+# Install Rust non-interactively and put cargo on the PATH
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Clone the whisper-jax repository
+RUN git clone https://github.com/sanchit-gandhi/whisper-jax.git
+WORKDIR /app/whisper-jax
+RUN pip install -r requirements.txt
+
+# Clone the whisper_real_time repository
+WORKDIR /app
+RUN git clone https://github.com/davabase/whisper_real_time.git
+WORKDIR /app/whisper_real_time/
+RUN pip install -r requirements.txt
+RUN pip install pyaudio
+
+# Clone the whisper repository
+WORKDIR /app
+RUN git clone https://github.com/openai/whisper.git
+WORKDIR /app/whisper
+RUN pip install -U openai-whisper
+RUN pip install --upgrade --no-deps --force-reinstall git+https://github.com/openai/whisper.git
+
+# Set up whisper_api (app.py is created empty here; replace it with your
+# real application code, e.g. via a COPY instruction)
+WORKDIR /app/whisper_api/
+RUN mkdir uploads
+RUN touch app.py
+RUN pip install flask flask-restful
+
+# Install whisper-jax dependencies
+WORKDIR /app/whisper-jax/
+RUN pip install git+https://github.com/deepmind/dm-haiku
+RUN pip install git+https://github.com/sanchit-gandhi/whisper-jax
+RUN pip install git+https://github.com/deepmind/optax
+RUN pip install -e ".[endpoint]"
+
+# Run the whisper-jax app when the container launches
+CMD ["python", "app/app.py"]
+```
+
+Once you have the Dockerfile ready, you can build the Docker image by running:
+
+```bash
+docker build -t your-docker-image-name .
+```
+
+This will take a while, as it has to clone repositories, install packages, and so on. When it's done, you can create a container from the image and run it with:
+
+```bash
+docker run -p 4000:4000 -t your-docker-image-name
+```
+
+Modify the Dockerfile and the docker run command according to your needs. For instance, if your Flask application runs on a different port, or if you need to expose more ports, add them to the docker run command. If you need more environment variables, you can use the ENV instruction in the Dockerfile or the -e option in the docker run command.
+
+You also need to think about persistent storage for your uploads folder, and about how you are going to handle command-line invocations like 'whisper audio.mp3 --model medium' or 'whisperx audio.mp3 --compute_type float32'.
+
+Remember, Docker containers are ephemeral. Any data that your application creates inside the container will be lost when the container is stopped and removed, unless it's tied to persistent storage such as a Docker volume.
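+
+As a minimal sketch of that last point (the volume name here is an arbitrary choice, and the mount path assumes the whisper_api layout used above), a named Docker volume mounted over the uploads folder lets uploaded audio survive container restarts:
+
+```bash
+# Create a named volume once; Docker manages its storage on the host.
+docker volume create whisper-uploads
+
+# Mount the volume over the API's uploads directory inside the container.
+docker run -p 4000:4000 \
+    -v whisper-uploads:/app/whisper_api/uploads \
+    -t your-docker-image-name
+```
\ No newline at end of file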
diff --git a/script.sh b/script.sh
new file mode 100644
index 0000000..44e8625
--- /dev/null
+++ b/script.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+# Clone whisper-jax repository
+git clone https://github.com/sanchit-gandhi/whisper-jax.git
+cd whisper-jax
+pip install -r requirements.txt
+pip install numpy scipy jax jaxlib flax optax transformers
+cd ..
+
+# Clone whisper_real_time repository
+git clone https://github.com/davabase/whisper_real_time.git
+cd whisper_real_time/
+# Install the PortAudio headers first so the pyaudio build succeeds
+sudo apt-get install -y portaudio19-dev
+pip install pyaudio
+pip install -r requirements.txt
+# The real-time demo is interactive; stop it with Ctrl+C
+python3 transcribe_demo.py
+cd ..
+
+# Clone whisper repository
+git clone https://github.com/openai/whisper.git
+cd whisper
+pip install -U openai-whisper
+pip install --upgrade --no-deps --force-reinstall git+https://github.com/openai/whisper.git
+sudo apt update && sudo apt install -y ffmpeg
+
+# Install Rust non-interactively and put cargo on the PATH
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+source "$HOME/.cargo/env"
+
+pip install setuptools-rust
+
+# Transcribe an audio file using whisper
+whisper audio.mp3 --model medium
+
+# Set up whisper_api
+cd ..
+mkdir -p whisper_api
+cd whisper_api/
+touch app.py
+mkdir -p uploads
+pip install flask flask-restful
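+
+# app.py was created empty above. The sketch below is a placeholder only:
+# the /transcribe route, model size, and port are assumptions, not part of
+# the original setup. It seeds app.py with a minimal Flask transcription API.
+cat > app.py <<'EOF'
+from flask import Flask, request, jsonify
+import whisper
+
+app = Flask(__name__)
+model = whisper.load_model("base")  # assumed model size; pick what fits your hardware
+
+@app.route("/transcribe", methods=["POST"])
+def transcribe():
+    # Save the uploaded audio, run Whisper on it, and return the text.
+    f = request.files["file"]
+    path = "uploads/" + f.filename
+    f.save(path)
+    result = model.transcribe(path)
+    return jsonify({"text": result["text"]})
+
+if __name__ == "__main__":
+    # Port 4000 matches the docker run example in dockerize.txt.
+    app.run(host="0.0.0.0", port=4000)
+EOF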
+
+# Start the Flask API (this blocks; run it in another terminal or background it)
+python app.py
+
+# Install ffmpeg and verify the installation
+sudo apt-get update
+sudo apt-get install -y ffmpeg
+ffmpeg -version
+
+# Transcribe audio using whisperx
+whisperx audio.mp3 --compute_type float32
+
+# Install dependencies for whisper-jax
+cd ..
+cd whisper-jax/
+pip install git+https://github.com/deepmind/dm-haiku
+pip install git+https://github.com/sanchit-gandhi/whisper-jax
+pip install git+https://github.com/deepmind/optax
+pip install -e ".[endpoint]"
+
+# Run the Gradio app
+python app/app.py
+
+# Save command history to a file (note: history is normally empty in a
+# non-interactive shell, so this only works when run interactively)
+history > command_history.txt
diff --git a/test.txt b/test.txt
new file mode 100644
index 0000000..1f1d3b2
--- /dev/null
+++ b/test.txt
@@ -0,0 +1 @@
+czxcjghjghghjghjgs
diff --git a/whisper-jax.zip b/whisper-jax.zip
new file mode 100644
index 0000000..6b500d4
Binary files /dev/null and b/whisper-jax.zip differ
diff --git a/whisper.zip b/whisper.zip
new file mode 100644
index 0000000..7e202f1
Binary files /dev/null and b/whisper.zip differ
diff --git a/whisperX.zip b/whisperX.zip
new file mode 100644
index 0000000..3471641
Binary files /dev/null and b/whisperX.zip differ
diff --git a/whisper_real_time/README.md b/whisper_real_time/README.md
new file mode 100644
index 0000000..e8f6acc
--- /dev/null
+++ b/whisper_real_time/README.md
@@ -0,0 +1,34 @@
+# Real Time Whisper Transcription
+
+![Demo gif](demo.gif)
+
+This is a demo of real-time speech-to-text with OpenAI's Whisper model. It works by constantly recording audio in a thread and concatenating the raw bytes over multiple recordings.
+
+To install dependencies, simply run
+```
+pip install -r requirements.txt
+```
+in an environment of your choosing.
+
+Whisper also requires the command-line tool [`ffmpeg`](https://ffmpeg.org/) to be installed on your system, which is available from most package managers:
+
+```
+# on Ubuntu or Debian
+sudo apt update && sudo apt install ffmpeg
+
+# on Arch Linux
+sudo pacman -S ffmpeg
+
+# on macOS using Homebrew (https://brew.sh/)
+brew install ffmpeg
+
+# on Windows using Chocolatey (https://chocolatey.org/)
+choco install ffmpeg
+
+# on Windows using Scoop (https://scoop.sh/)
+scoop install ffmpeg
+```
+
+For more information on Whisper, please see https://github.com/openai/whisper
+
+The code in this repository is public domain.
\ No newline at end of file
diff --git a/whisper_real_time/requirements.txt b/whisper_real_time/requirements.txt
new file mode 100644
index 0000000..ae172ba
--- /dev/null
+++ b/whisper_real_time/requirements.txt
@@ -0,0 +1,5 @@
+pyaudio
+SpeechRecognition
+--extra-index-url https://download.pytorch.org/whl/cu116
+torch
+git+https://github.com/openai/whisper.git
\ No newline at end of file
diff --git a/whisper_real_time/transcribe_demo.py b/whisper_real_time/transcribe_demo.py
new file mode 100644
index 0000000..6dd8972
--- /dev/null
+++ b/whisper_real_time/transcribe_demo.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+
+import argparse
+import io
+import os
+import speech_recognition as sr
+import whisper
+import torch
+
+from datetime import datetime, timedelta
+from queue import Queue
+from tempfile import NamedTemporaryFile
+from time import sleep
+from sys import platform
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", default="medium", help="Model to use",
+                        choices=["tiny", "base", "small", "medium", "large"])
+    parser.add_argument("--non_english", action='store_true',
+                        help="Don't use the english model.")
+    parser.add_argument("--energy_threshold", default=1000,
+                        help="Energy level for mic to detect.", type=int)
+    parser.add_argument("--record_timeout", default=2,
+                        help="How real-time the recording is, in seconds.", type=float)
+    parser.add_argument("--phrase_timeout", default=3,
+                        help="How much silence between recordings before we "
+                             "consider it a new line in the transcription.", type=float)
+    if 'linux' in platform:
+        parser.add_argument("--default_microphone", default='pulse',
+                            help="Default microphone name for SpeechRecognition. "
+                                 "Run this with 'list' to view available Microphones.", type=str)
+    args = parser.parse_args()
+
+    # The last time a recording was retrieved from the queue.
+    phrase_time = None
+    # Current raw audio bytes.
+    last_sample = bytes()
+    # Thread-safe Queue for passing data from the threaded recording callback.
+    data_queue = Queue()
+    # We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends.
+    recorder = sr.Recognizer()
+    recorder.energy_threshold = args.energy_threshold
+    # Definitely do this: dynamic energy compensation lowers the energy threshold dramatically, to a point where the SpeechRecognizer never stops recording.
+    recorder.dynamic_energy_threshold = False
+
+    # Important for Linux users.
+    # Prevents a permanent application hang or crash caused by using the wrong microphone.
+    if 'linux' in platform:
+        mic_name = args.default_microphone
+        if not mic_name or mic_name == 'list':
+            print("Available microphone devices are: ")
+            for index, name in enumerate(sr.Microphone.list_microphone_names()):
+                print(f"Microphone with name \"{name}\" found")
+            return
+        else:
+            for index, name in enumerate(sr.Microphone.list_microphone_names()):
+                if mic_name in name:
+                    source = sr.Microphone(sample_rate=16000, device_index=index)
+                    break
+    else:
+        source = sr.Microphone(sample_rate=16000)
+
+    # Load / download the model.
+    model = args.model
+    if args.model != "large" and not args.non_english:
+        model = model + ".en"
+    audio_model = whisper.load_model(model)
+
+    record_timeout = args.record_timeout
+    phrase_timeout = args.phrase_timeout
+
+    temp_file = NamedTemporaryFile().name
+    transcription = ['']
+
+    with source:
+        recorder.adjust_for_ambient_noise(source)
+
+    def record_callback(_, audio: sr.AudioData) -> None:
+        """
+        Threaded callback function to receive audio data when recordings finish.
+        audio: An AudioData containing the recorded bytes.
+        """
+        # Grab the raw bytes and push them into the thread-safe queue.
+        data = audio.get_raw_data()
+        data_queue.put(data)
+
+    # Create a background thread that will pass us raw audio bytes.
+    # We could do this manually but SpeechRecognizer provides a nice helper.
+    recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)
+
+    # Cue the user that we're ready to go.
+    print("Model loaded.\n")
+
+    while True:
+        try:
+            now = datetime.utcnow()
+            # Pull raw recorded audio from the queue.
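+            # Note: last_sample accumulates raw audio until a phrase boundary is
+            # detected, so Whisper re-transcribes the whole growing phrase on each
+            # pass; that is what lets the current line be updated in place below.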
+            if not data_queue.empty():
+                phrase_complete = False
+                # If enough time has passed between recordings, consider the phrase complete.
+                # Clear the current working audio buffer to start over with the new data.
+                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
+                    last_sample = bytes()
+                    phrase_complete = True
+                # This is the last time we received new audio data from the queue.
+                phrase_time = now
+
+                # Concatenate our current audio data with the latest audio data.
+                while not data_queue.empty():
+                    data = data_queue.get()
+                    last_sample += data
+
+                # Use AudioData to convert the raw data to wav data.
+                audio_data = sr.AudioData(last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
+                wav_data = io.BytesIO(audio_data.get_wav_data())
+
+                # Write wav data to the temporary file as bytes.
+                with open(temp_file, 'w+b') as f:
+                    f.write(wav_data.read())
+
+                # Read the transcription.
+                result = audio_model.transcribe(temp_file, fp16=torch.cuda.is_available())
+                text = result['text'].strip()
+
+                # If we detected a pause between recordings, add a new item to our transcription.
+                # Otherwise edit the existing one.
+                if phrase_complete:
+                    transcription.append(text)
+                else:
+                    transcription[-1] = text
+
+                # Clear the console to reprint the updated transcription.
+                os.system('cls' if os.name == 'nt' else 'clear')
+                for line in transcription:
+                    print(line)
+                # Flush stdout.
+                print('', end='', flush=True)
+
+                # Infinite loops are bad for processors; we must sleep.
+                sleep(0.25)
+        except KeyboardInterrupt:
+            break
+
+    print("\n\nTranscription:")
+    for line in transcription:
+        print(line)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file