Serverless GPU transcription with Modal
I want to quickly share how easy it is to set up a serverless GPU function with Modal.
Background: my girlfriend had recorded 3 hours of audio during an interview and I decided to help her out with some automatic audio transcription.
Things that I like about this setup:
- a simple Python script
- I can send my gf just a link to the service, where she can upload her audio files and get the text back
- since it's serverless, I don't have to worry about paying for idle time
- I can get around the 30-second limits of free online transcription services
- Modal gives you $30 in free credits per month to play around with
imports

```python
import os

import modal
from fastapi import File, HTTPException, UploadFile
from pydantic import BaseModel
```
stub and cache creation

```python
stub = modal.Stub("whisper")

stub.image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        "torch",
        "transformers",
    )
    .apt_install("ffmpeg")
    .pip_install("ffmpeg-python")
)

# persisted shared volume so the model weights only have to be downloaded once
CACHE_PATH = "/root/cache"
cache_volume = modal.SharedVolume().persist("whisper-cache")
```
pipeline and inference logic

```python
@stub.cls(
    gpu="T4",  # a 16 GB GPU will suffice
    shared_volumes={CACHE_PATH: cache_volume},  # here we reference our cache volume
    container_idle_timeout=120,  # how long (in seconds) to keep the container alive between requests
    concurrency_limit=1,  # how many GPUs may be provisioned at the same time
    keep_warm=0,  # how many containers to keep alive at all times
)
class WhisperPipeline:
    def __enter__(self):  # for container lifecycle mgmt
        os.environ["TRANSFORMERS_CACHE"] = CACHE_PATH
        # these imports will use the stub's image
        import torch
        from transformers import pipeline

        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        # by specifying chunk_length_s, we can throw in audio files
        # of any length, as long as they fit in the GPU
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-tiny",
            chunk_length_s=30,
            device=device,
        )

    @modal.method()
    def transcribe(self, file: bytes) -> str:
        return self.pipe(file)["text"]
```
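To sanity-check the pipeline before adding a web endpoint, you can attach a small local entrypoint and call the class from your own machine. This is only a minimal sketch, assuming the script is saved as app.py; the `main` function name and the `path` argument are illustrative and not part of the setup above.

```python
# minimal local test sketch (assumption: this script is saved as app.py)
@stub.local_entrypoint()
def main(path: str):
    with open(path, "rb") as f:
        audio_bytes = f.read()
    # .call() runs transcribe in a GPU container in the cloud and returns the text
    print(WhisperPipeline().transcribe.call(file=audio_bytes))
```

Running something like `$ modal run app.py --path interview.mp3` should build the image, spin up the T4 container, print the transcript, and tear everything down again.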
web endpoint

```python
class TranscribeResponse(BaseModel):
    prediction: str


@stub.function()
@modal.web_endpoint(method="POST")
async def entrypoint(file: UploadFile = File(...)):
    try:
        whisper_pipe = WhisperPipeline()
        prediction = whisper_pipe.transcribe.call(file=file.file.read())
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    return TranscribeResponse(prediction=prediction)
```

You can either serve this temporarily with `$ modal serve app.py` or deploy it to the cloud with `$ modal deploy app.py`.
Finally, go to the deployment URL and add `/docs` to see the Swagger UI and try it out!
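If you'd rather call the endpoint from code than from the Swagger UI, a plain HTTP POST with the audio in a multipart form does the job. A minimal sketch, assuming `requests` is installed; the URL is a placeholder for whatever deployment URL Modal prints for your app, and interview.mp3 stands in for the actual recording.

```python
import requests

# placeholder: use the deployment URL that `modal deploy` / `modal serve` prints
URL = "https://your-workspace--whisper-entrypoint.modal.run"

with open("interview.mp3", "rb") as f:
    resp = requests.post(URL, files={"file": f})

resp.raise_for_status()
print(resp.json()["prediction"])  # the TranscribeResponse field defined above
```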