Skip to main content

Modules Guide

Complete guide to Audio Capture, Speech-to-Text, and Language Models.

Overview

ElysGenAI provides three integrated modules:

Microphone → Audio Capture → STT → Transcribed Text → LLM → Generated Response

Modules:

  1. Audio Capture - Microphone input and routing
  2. STT - Speech-to-text using Whisper
  3. LLM - Language models using Phi-3-mini

Audio Capture System

Quick Start

// Get subsystem
UERP_AudioCaptureSubsystem* AudioSubsystem =
GetGameInstance()->GetSubsystem<UERP_AudioCaptureSubsystem>();

// Start capturing
AudioSubsystem->StartCapture();

Consumer Pattern

Implement IERP_AudioConsumer to receive audio:

UCLASS()
class UMyAudioConsumer : public UObject, public IERP_AudioConsumer
{
GENERATED_BODY()

public:
virtual void OnAudioDataReceived_Implementation(const FERP_AudioBuffer& Buffer) override
{
// Process audio
ProcessAudio(Buffer.AudioData);
}

virtual FString GetConsumerName_Implementation() const override
{
return TEXT("MyAudioConsumer");
}
};

Register Consumer

void UMyComponent::BeginPlay()
{
Super::BeginPlay();

UERP_AudioCaptureSubsystem* AudioSubsystem =
GetGameInstance()->GetSubsystem<UERP_AudioCaptureSubsystem>();

if (AudioSubsystem)
{
AudioSubsystem->RegisterConsumer(this);
}
}

void UMyComponent::EndPlay(const EEndPlayReason::Type Reason)
{
if (UERP_AudioCaptureSubsystem* AudioSubsystem =
GetGameInstance()->GetSubsystem<UERP_AudioCaptureSubsystem>())
{
AudioSubsystem->UnregisterConsumer(this);
}

Super::EndPlay(Reason);
}

Push-to-Talk Modes

UENUM(BlueprintType)
enum class EElysPushToTalkMode : uint8
{
AlwaysOn, // Continuous capture
PushToTalk, // Hold key to talk
PushToMute // Hold key to mute
};

Configuration:

AudioSubsystem->SetPushToTalkMode(EElysPushToTalkMode::PushToTalk);
AudioSubsystem->SetPushToTalkActive(true); // Key pressed
AudioSubsystem->SetPushToTalkActive(false); // Key released

Input Binding:

// In PlayerController
void AMyPlayerController::SetupInputComponent()
{
Super::SetupInputComponent();

InputComponent->BindAction("VoiceChat", IE_Pressed, this, &AMyPlayerController::StartVoiceChat);
InputComponent->BindAction("VoiceChat", IE_Released, this, &AMyPlayerController::StopVoiceChat);
}

void AMyPlayerController::StartVoiceChat()
{
auto* AudioSubsystem = GetGameInstance()->GetSubsystem<UERP_AudioCaptureSubsystem>();
AudioSubsystem->SetPushToTalkActive(true);
}

void AMyPlayerController::StopVoiceChat()
{
auto* AudioSubsystem = GetGameInstance()->GetSubsystem<UERP_AudioCaptureSubsystem>();
AudioSubsystem->SetPushToTalkActive(false);
}

Audio Formats

USTRUCT(BlueprintType)
struct FERP_AudioFormat
{
int32 SampleRate; // 16000 (STT), 48000 (voice chat)
int32 NumChannels; // 1 (mono), 2 (stereo)
int32 BitDepth; // 16
};

STT Default: 16 kHz, mono, 16-bit. Voice Chat Default: 48 kHz, stereo, 16-bit.

Mute Control

// Mute microphone
AudioSubsystem->SetMuted(true);

// Check mute status
bool bIsMuted = AudioSubsystem->IsMuted();

Speech-to-Text (STT)

Quick Setup

// 1. Add component to Actor
UPROPERTY(VisibleAnywhere, BlueprintReadOnly)
UERP_STTComponent* STTComponent;

// 2. Bind event
STTComponent->OnTranscriptionComplete.AddDynamic(
this, &AMyActor::OnTranscriptionReceived);

// 3. Start listening
STTComponent->StartListening();

// 4. Handle results
void AMyActor::OnTranscriptionReceived(const FERP_STTResult& Result)
{
UE_LOG(LogTemp, Log, TEXT("Transcription: %s"), *Result.TranscribedText);
}

Component API

StartListening:

UFUNCTION(BlueprintCallable, Category="ElysGenAI|STT")
void StartListening();

StopListening:

UFUNCTION(BlueprintCallable, Category="ElysGenAI|STT")
void StopListening();

IsListening:

UFUNCTION(BlueprintPure, Category="ElysGenAI|STT")
bool IsListening() const;

SetLanguageCode:

UFUNCTION(BlueprintCallable, Category="ElysGenAI|STT")
void SetLanguageCode(const FString& LanguageCode);

Configuration

Component Properties:

UPROPERTY(EditAnywhere, BlueprintReadWrite, Category="ElysGenAI|STT")
bool bAutoStartListening = false;

UPROPERTY(EditAnywhere, BlueprintReadWrite, Category="ElysGenAI|STT")
FString LanguageCode = TEXT("en");

UPROPERTY(EditAnywhere, BlueprintReadWrite, Category="ElysGenAI|STT")
bool bEnableVAD = true; // Voice activity detection

UPROPERTY(EditAnywhere, BlueprintReadWrite, Category="ElysGenAI|STT")
float MinConfidence = 0.5f;

Project Settings: Project Settings → Elys GenAI Framework → STT

  • Backend: Whisper
  • Model Path: (empty = use bundled)
  • NumThreads: 4 (match CPU cores)
  • Enable VAD: true

Events

OnTranscriptionComplete:

DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(
FElysSTTResultDelegate,
const FERP_STTResult&, Result
);

UPROPERTY(BlueprintAssignable, Category="ElysGenAI|STT")
FElysSTTResultDelegate OnTranscriptionComplete;

Result Structure:

USTRUCT(BlueprintType)
struct FERP_STTResult
{
UPROPERTY(BlueprintReadOnly)
FString TranscribedText;

UPROPERTY(BlueprintReadOnly)
float Confidence; // 0.0-1.0

UPROPERTY(BlueprintReadOnly)
FString Language;

UPROPERTY(BlueprintReadOnly)
bool bIsFinal;
};

Whisper Models

| Model   | Size    | Use Case                     |
|---------|---------|------------------------------|
| tiny    | ~75 MB  | Testing, prototyping         |
| base.en | ~74 MB  | Recommended: English only    |
| base    | ~142 MB | Multilingual (99+ languages) |
| small   | ~466 MB | Better accuracy              |
| medium  | ~1.5 GB | High accuracy                |
| large   | ~3 GB   | Best accuracy                |

Supported Languages: en, es, fr, de, it, pt, nl, ru, zh, ja, ko, ar, hi, and 87+ more

Voice Activity Detection (VAD)

VAD filters silence automatically:

STTComponent->SetEnableVAD(true);  // Enabled by default

// Adjust sensitivity (0.0-1.0)
// Higher = more aggressive filtering
STTComponent->SetVADThreshold(0.5f);

When to adjust:

  • Noisy environment: Increase threshold (0.6-0.8)
  • Quiet environment: Decrease threshold (0.3-0.5)

Performance Tuning

Thread Count:

// Project Settings → STT → NumThreads
// Set to CPU core count for best performance
NumThreads = 4; // For quad-core CPU

Buffer Duration:

// Project Settings → Audio → Buffer Duration
BufferDuration = 100ms; // Lower = less latency, higher = better accuracy

Language Models (LLM)

Quick Setup

// 1. Add component
UPROPERTY(VisibleAnywhere)
UERP_LLMComponent* LLMComponent;

// 2. Set system prompt
LLMComponent->SetSystemPrompt(TEXT("You are a friendly merchant NPC."));

// 3. Bind event
LLMComponent->OnGenerationComplete.AddDynamic(
this, &AMyNPC::OnDialogueGenerated);

// 4. Send message
LLMComponent->SendMessage(TEXT("What are you selling?"));

// 5. Handle response
void AMyNPC::OnDialogueGenerated(const FERP_LLMResult& Result)
{
DisplayDialogue(Result.GeneratedText);
}

Component API

SendMessage:

UFUNCTION(BlueprintCallable, Category="ElysGenAI|LLM")
void SendMessage(const FString& Message);

SetSystemPrompt:

UFUNCTION(BlueprintCallable, Category="ElysGenAI|LLM")
void SetSystemPrompt(const FString& Prompt);

ClearHistory:

UFUNCTION(BlueprintCallable, Category="ElysGenAI|LLM")
void ClearHistory();

Configuration

Component Properties:

UPROPERTY(EditAnywhere, Category="ElysGenAI|LLM")
FString SystemPrompt = TEXT("You are a helpful assistant.");

UPROPERTY(EditAnywhere, Category="ElysGenAI|LLM")
int32 MaxTokens = 256;

UPROPERTY(EditAnywhere, Category="ElysGenAI|LLM")
float Temperature = 0.7f; // 0.0-2.0

UPROPERTY(EditAnywhere, Category="ElysGenAI|LLM")
int32 MaxHistoryMessages = 20;

Project Settings: Project Settings → Elys GenAI Framework → LLM

  • Backend: LlamaCpp
  • Model Path: (empty = use bundled Phi-3)
  • ContextLength: 4096 tokens
  • NumThreads: 4

Events

OnGenerationComplete:

UPROPERTY(BlueprintAssignable)
FERP_LLMResultDelegate OnGenerationComplete;

Result Structure:

USTRUCT(BlueprintType)
struct FERP_LLMResult
{
UPROPERTY(BlueprintReadOnly)
FString GeneratedText;

UPROPERTY(BlueprintReadOnly)
int32 TokenCount;

UPROPERTY(BlueprintReadOnly)
EElysLLMFinishReason FinishReason; // Completed, Length, Stop
};

OnTokenGenerated (Streaming):

UPROPERTY(BlueprintAssignable)
FERP_LLMTokenDelegate OnTokenGenerated;

Use for typewriter effects:

LLMComponent->OnTokenGenerated.AddDynamic(this, &AMyNPC::OnToken);

void AMyNPC::OnToken(const FString& Token)
{
DialogueText += Token;
UpdateDialogueUI(DialogueText);
}

Bundled Model: Phi-3-mini

Specs:

  • Size: ~2.7GB (Q4 quantized)
  • Context: 4096 tokens
  • License: MIT
  • Speed: ~20 tokens/sec (CPU)

Use Cases:

  • NPC dialogue
  • Quest generation
  • Item descriptions
  • Dynamic storytelling

Temperature Guide

Controls creativity/randomness:

// 0.0-0.3: Factual, deterministic (game mechanics, tutorials)
LLMComponent->SetTemperature(0.3f);

// 0.4-0.7: Balanced (NPC dialogue, descriptions)
LLMComponent->SetTemperature(0.7f);

// 0.8-1.5: Creative (storytelling, humor)
LLMComponent->SetTemperature(1.2f);

System Prompt Best Practices

Clear Instructions:

FString SystemPrompt = TEXT(
"You are a wise wizard NPC named Gandor. "
"Keep responses under 50 words. "
"Speak in archaic English. "
"Never break character."
);

Few-Shot Examples:

FString SystemPrompt = TEXT(
"You are a merchant. Examples:\n"
"Player: 'What do you sell?'\n"
"You: 'Potions, weapons, and armor!'\n"
"Player: 'How much for a sword?'\n"
"You: '100 gold pieces.'"
);

Combined Examples

Voice-to-Dialogue Pipeline

// 1. Capture audio → 2. Transcribe → 3. Generate response

UCLASS()
class AMyNPC : public AActor
{
GENERATED_BODY()

public:
UPROPERTY(VisibleAnywhere)
UERP_STTComponent* STTComponent;

UPROPERTY(VisibleAnywhere)
UERP_LLMComponent* LLMComponent;

protected:
virtual void BeginPlay() override
{
Super::BeginPlay();

// Setup STT
STTComponent->SetLanguageCode(TEXT("en"));
STTComponent->SetAutoStartListening(true);
STTComponent->OnTranscriptionComplete.AddDynamic(
this, &AMyNPC::OnPlayerSpoke);

// Setup LLM
LLMComponent->SetSystemPrompt(TEXT("You are a friendly merchant."));
LLMComponent->OnGenerationComplete.AddDynamic(
this, &AMyNPC::OnDialogueGenerated);
}

UFUNCTION()
void OnPlayerSpoke(const FERP_STTResult& Result)
{
// Send transcription to LLM
LLMComponent->SendMessage(Result.TranscribedText);
}

UFUNCTION()
void OnDialogueGenerated(const FERP_LLMResult& Result)
{
// Display NPC response
DisplayDialogue(Result.GeneratedText);
}
};

Multi-Consumer Audio Routing

// Route audio to both STT and voice chat

UCLASS()
class AMyPlayerController : public APlayerController
{
GENERATED_BODY()

public:
UPROPERTY(VisibleAnywhere)
UERP_STTComponent* STTComponent;

UPROPERTY(VisibleAnywhere)
UVoiceChatComponent* VoiceChatComponent;

protected:
virtual void BeginPlay() override
{
Super::BeginPlay();

// Both components automatically register as audio consumers
// Audio flows to both simultaneously
STTComponent->StartListening();
VoiceChatComponent->StartTransmitting();
}
};

Settings Reference

Audio Settings

| Setting         | Default  | Description               |
|-----------------|----------|---------------------------|
| Sample Rate     | 16000 Hz | Audio capture sample rate |
| Channels        | 1 (Mono) | Audio channels            |
| Bit Depth       | 16       | Audio bit depth           |
| Buffer Duration | 100 ms   | Audio buffer size         |

STT Settings

| Setting     | Default   | Description              |
|-------------|-----------|--------------------------|
| STT Backend | Whisper   | Backend implementation   |
| Model Path  | (bundled) | Path to Whisper model    |
| Language    | en        | Target language code     |
| Enable VAD  | true      | Voice activity detection |
| Num Threads | 4         | Inference threads        |

LLM Settings

| Setting        | Default   | Description               |
|----------------|-----------|---------------------------|
| LLM Backend    | LlamaCpp  | Backend implementation    |
| Model Path     | (bundled) | Path to Phi-3 model       |
| Context Length | 4096      | Maximum context tokens    |
| Temperature    | 0.7       | Sampling temperature      |
| Max Tokens     | 512       | Maximum generation length |
| Num Threads    | 4         | Inference threads         |

Next Steps