
Run Local Models: Image-to-Text and Text-to-Speech with a Streamlit-Based UI


Setup Overview

My setup consists of an NVIDIA GTX 1060 GPU on a system running CachyOS Linux.


Python Prerequisites

Create Python Venv

CachyOS uses fish as its default shell; if you use a different shell, adapt the activation command (activate.fish) accordingly.

# Create virtual environment
python3 -m venv .venv-hf-project-2

# Activate environment
source .venv-hf-project-2/bin/activate.fish

# Upgrade pip
pip install --upgrade pip

# (Deactivate venv)
deactivate

Pip Requirements

Create Pinned Version List

Find the available CUDA versions at: https://download.pytorch.org/whl/

# List available versions
pip index versions torchaudio \
  --index-url https://download.pytorch.org/whl/cu118

# Shell output:
torchaudio (2.7.1+cu118)
Available versions: 2.7.1+cu118, 2.7.0+cu118, 2.6.0+cu118, 2.5.1+cu118, 2.5.0+cu118

Create a file with the intended packages:

  • requirements.in
--index-url https://download.pytorch.org/whl/cu118
--extra-index-url https://pypi.org/simple

torch==2.7.1+cu118
torchaudio==2.7.1+cu118

python-dotenv
pillow
streamlit
scipy

transformers
accelerate
datasets
huggingface_hub

# Install pip tools
pip install pip-tools

# Create a txt file with the exact pinned package and dependency versions
pip-compile -o requirements.txt requirements.in

The pinned requirements and their dependencies look like this:

  • requirements.txt
#
# This file is autogenerated by pip-compile with Python 3.13
# by the following command:
#
#    pip-compile --output-file=requirements.txt requirements.in
#
--index-url https://download.pytorch.org/whl/cu118
--extra-index-url https://pypi.org/simple

accelerate==1.12.0
    # via -r requirements.in
aiohappyeyeballs==2.6.1
    # via aiohttp
aiohttp==3.13.2
    # via fsspec
aiosignal==1.4.0
    # via aiohttp
altair==6.0.0
    # via streamlit
anyio==4.12.0
    # via httpx
attrs==25.4.0
    # via
    #   aiohttp
    #   jsonschema
    #   referencing
blinker==1.9.0
    # via streamlit
cachetools==6.2.2
    # via streamlit
certifi==2025.11.12
    # via
    #   httpcore
    #   httpx
    #   requests
charset-normalizer==3.4.4
    # via requests
click==8.3.1
    # via streamlit
datasets==4.4.1
    # via -r requirements.in
dill==0.4.0
    # via
    #   datasets
    #   multiprocess
filelock==3.20.0
    # via
    #   datasets
    #   huggingface-hub
    #   torch
    #   transformers
frozenlist==1.8.0
    # via
    #   aiohttp
    #   aiosignal
fsspec[http]==2025.10.0
    # via
    #   datasets
    #   huggingface-hub
    #   torch
gitdb==4.0.12
    # via gitpython
gitpython==3.1.45
    # via streamlit
h11==0.16.0
    # via httpcore
hf-xet==1.2.0
    # via huggingface-hub
httpcore==1.0.9
    # via httpx
httpx==0.28.1
    # via datasets
huggingface-hub==0.36.0
    # via
    #   -r requirements.in
    #   accelerate
    #   datasets
    #   tokenizers
    #   transformers
idna==3.11
    # via
    #   anyio
    #   httpx
    #   requests
    #   yarl
jinja2==3.1.6
    # via
    #   altair
    #   pydeck
    #   torch
jsonschema==4.25.1
    # via altair
jsonschema-specifications==2025.9.1
    # via jsonschema
markupsafe==3.0.3
    # via jinja2
mpmath==1.3.0
    # via sympy
multidict==6.7.0
    # via
    #   aiohttp
    #   yarl
multiprocess==0.70.18
    # via datasets
narwhals==2.13.0
    # via altair
networkx==3.6.1
    # via torch
numpy==2.3.5
    # via
    #   accelerate
    #   datasets
    #   pandas
    #   pydeck
    #   scipy
    #   streamlit
    #   transformers
nvidia-cublas-cu11==11.11.3.6
    # via
    #   nvidia-cudnn-cu11
    #   nvidia-cusolver-cu11
    #   torch
nvidia-cuda-cupti-cu11==11.8.87
    # via torch
nvidia-cuda-nvrtc-cu11==11.8.89
    # via torch
nvidia-cuda-runtime-cu11==11.8.89
    # via torch
nvidia-cudnn-cu11==9.1.0.70
    # via torch
nvidia-cufft-cu11==10.9.0.58
    # via torch
nvidia-curand-cu11==10.3.0.86
    # via torch
nvidia-cusolver-cu11==11.4.1.48
    # via torch
nvidia-cusparse-cu11==11.7.5.86
    # via torch
nvidia-nccl-cu11==2.21.5
    # via torch
nvidia-nvtx-cu11==11.8.86
    # via torch
packaging==25.0
    # via
    #   accelerate
    #   altair
    #   datasets
    #   huggingface-hub
    #   streamlit
    #   transformers
pandas==2.3.3
    # via
    #   datasets
    #   streamlit
pillow==12.0.0
    # via
    #   -r requirements.in
    #   streamlit
propcache==0.4.1
    # via
    #   aiohttp
    #   yarl
protobuf==6.33.2
    # via streamlit
psutil==7.1.3
    # via accelerate
pyarrow==22.0.0
    # via
    #   datasets
    #   streamlit
pydeck==0.9.1
    # via streamlit
python-dateutil==2.9.0.post0
    # via pandas
python-dotenv==1.2.1
    # via -r requirements.in
pytz==2025.2
    # via pandas
pyyaml==6.0.3
    # via
    #   accelerate
    #   datasets
    #   huggingface-hub
    #   transformers
referencing==0.37.0
    # via
    #   jsonschema
    #   jsonschema-specifications
regex==2025.11.3
    # via transformers
requests==2.32.5
    # via
    #   datasets
    #   huggingface-hub
    #   streamlit
    #   transformers
rpds-py==0.30.0
    # via
    #   jsonschema
    #   referencing
safetensors==0.7.0
    # via
    #   accelerate
    #   transformers
scipy==1.16.3
    # via -r requirements.in
six==1.17.0
    # via python-dateutil
smmap==5.0.2
    # via gitdb
streamlit==1.52.1
    # via -r requirements.in
sympy==1.14.0
    # via torch
tenacity==9.1.2
    # via streamlit
tokenizers==0.22.1
    # via transformers
toml==0.10.2
    # via streamlit
torch==2.7.1+cu118
    # via
    #   -r requirements.in
    #   accelerate
    #   torchaudio
torchaudio==2.7.1+cu118
    # via -r requirements.in
tornado==6.5.3
    # via streamlit
tqdm==4.67.1
    # via
    #   datasets
    #   huggingface-hub
    #   transformers
transformers==4.57.3
    # via -r requirements.in
triton==3.3.1
    # via torch
typing-extensions==4.15.0
    # via
    #   altair
    #   huggingface-hub
    #   streamlit
    #   torch
tzdata==2025.2
    # via pandas
urllib3==2.6.2
    # via requests
watchdog==6.0.0
    # via streamlit
xxhash==3.6.0
    # via datasets
yarl==1.22.0
    # via aiohttp

# The following packages are considered to be unsafe in a requirements file:
# setuptools

Install Requirements

# Install pinned requirements and dependencies
pip install -r requirements.txt
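
After the install it is worth verifying that the CUDA build of PyTorch actually detects the GPU. The following is a minimal sanity check to run inside the activated venv (e.g. via python -c or a throwaway script):

# Quick check that the cu118 torch build sees the GPU
import torch

print(torch.__version__)              # expected to end in +cu118
print(torch.cuda.is_available())      # expected: True
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # e.g. the GTX 1060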



Python App

Hugging Face Model

Model Link: https://huggingface.co/Salesforce/blip-image-captioning-base
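
Before wiring the model into the Streamlit app, a short standalone script can confirm that the captioning pipeline downloads and runs. This is only a sketch; example.jpg is a placeholder for any local image:

# Standalone sanity check for the BLIP captioning pipeline
from transformers import pipeline
from PIL import Image

captioner = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-base",
)  # Omit device to let it auto-detect

image = Image.open("example.jpg").convert("RGB")  # placeholder image path
print(captioner(image)[0]["generated_text"])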


File and Folder Structure

The file and folder structure of the project looks like this:

hf-project-2
├── app.py
├── caption.wav  # Created / overwritten by the text-to-speech model
├── .env
├── requirements.in
└── requirements.txt

.env File & HF API Token

Create a Hugging Face API token on huggingface.co:

  • Go to: “Settings” > “Access Tokens” > “Create new token”

  • Select “Read”

  • Click “Create token”


Add the token to the .env file:

  • .env
# Hugging Face read token
HUGGINGFACEHUB_API_TOKEN=mysecuretoken
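
The BLIP and Bark models used here are public, so the token is not strictly required, but the following optional sketch checks that the token from .env is picked up and valid (whoami returns the account the token belongs to):

# Optional: verify the Hugging Face token from .env
import os
from dotenv import find_dotenv, load_dotenv
from huggingface_hub import whoami

load_dotenv(find_dotenv())
token = os.environ["HUGGINGFACEHUB_API_TOKEN"]
print(whoami(token=token)["name"])  # prints the Hugging Face username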

Python app.py

from dotenv import find_dotenv, load_dotenv
from transformers import pipeline
from PIL import Image
from scipy.io.wavfile import write as write_wav
import streamlit as st
import numpy as np
import os

# Load variables from .env file
load_dotenv(find_dotenv())

# Function to load the image-to-text model
@st.cache_resource  # Cache the model
def load_model_img_to_text():
    return pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device="cuda:0"
    )  # Omit device to let it auto-detect

# Function to load the text-to-speech model
@st.cache_resource
def load_model_text_to_speech():
    return pipeline(
        "text-to-speech",
        model="suno/bark-small",
        device="cuda:0",
    )  # Omit device to let it auto-detect

# Call the cached function > returns the image-to-text pipeline object
model_img_to_text = load_model_img_to_text()

# Call the cached function > returns the text-to-speech pipeline object
model_text_to_speech = load_model_text_to_speech()


# Function: run text-to-speech and save the result as a WAV file
def text_to_speech(text: str, output_file: str = "caption.wav") -> None:

    speech = model_text_to_speech(text)

    # Bark pipeline returns a float waveform and sampling rate
    audio_array = speech["audio"]
    sample_rate = speech["sampling_rate"]

    # Ensure a flat float32 numpy array
    audio_array = np.array(audio_array, dtype=np.float32).flatten()

    # Clip to [-1, 1] and convert to 16-bit PCM for WAV
    audio_int16 = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)

    write_wav(output_file, sample_rate, audio_int16)
    print(f"Audio saved as: {output_file}")

# Streamlit UI
st.title("Image to Text to Audio Demo")
st.write("Upload an image and output the description with audio")

uploaded_file = st.file_uploader(
    "Upload image:",
    type=["jpg", "jpeg", "png"]
)

if uploaded_file is not None:
    # Open and show the image
    image = Image.open(uploaded_file).convert("RGB")
    st.image(image, caption="Uploaded image", width=700)

    if st.button("Generate caption & audio"):
        with st.spinner("Running model..."):
            # Image -> Text
            img_to_text = model_img_to_text(image)
            caption = img_to_text[0]["generated_text"]

            # Text -> Audio (save to WAV)
            audio_path = "caption.wav"
            text_to_speech(caption, audio_path)

            # Read WAV for Streamlit
            with open(audio_path, "rb") as f:
                audio_bytes = f.read()

        st.success("Caption:")
        st.write(caption)

        st.audio(audio_bytes, format="audio/wav")

Run App

# Run app
streamlit run app.py

# Shell output:
You can now view your Streamlit app in your browser.

Local URL: http://localhost:8501
Network URL: http://192.168.70.21:8501

# Open app in browser
http://192.168.70.21:8501

The Python Streamlit UI looks like this: