o �J�h )�@szddlmZmZddlZddlmZmZmZm Z m Z ddl Z ddl Z ddl Z ddlmZddlZddlTGdd�d�ZdS)�)� VadOptions� get_vad_modelN)�BinaryIO�Union�List�Optional�Tuple)�SpeechTimestampsMap)�*c @s�eZdZdd�Ze��fdeeee j fde dejde e j e effdd�Zd e��fde j d ee dejde efd d �Zd d�Zede j de ede j fdd��Ze  ddedededefdd��Z dde ede edeede efdd�Zd S) � SileroVADcCsd|_d|_d|_dS)Ni�>i)� sampling_rate�window_size_samples�model��self�r�>C:\pinokio\api\whisper-webui.git\app\modules\vad\silero_vad.py�__init__s zSileroVAD.__init__�audio�vad_parameters�progress�returncCs�|j}t|tj�stj||d�}|jd|}|}|dur!t�}n t|t�r-tdi|��}|j |||d�}|� ||�}|jd|}||fS)a Run VAD Parameters ---------- audio: Union[str, BinaryIO, np.ndarray] Audio path or file binary or Audio numpy array vad_parameters: Options for VAD processing. progress: gr.Progress Indicator to show progress directly in gradio. Returns ---------- np.ndarray Pre-processed audio with VAD List[dict] Chunks of speeches to be used to restore the timestamps later )r rN)r� vad_optionsrr) r � isinstance�np�ndarray�faster_whisper� decode_audio�shaper�dict�get_speech_timestamps�collect_chunks)rrrrr �duration�duration_after_vad� speech_chunksrrr�runs"  � z SileroVAD.runNrcKs�|jdur |��|durtdi|��}|j}|j}|j}|j}|j} |j} |j } |j |d} |j | d} |j || d| }|j | d}|j dd}t |�}t � |d| |jd| f�}|�|�dd���d�}d}g}i}|dur�t|d d �}d}d}}t|�D]�\}}||kr�|r�d}||kr�| |}||kr�|s�d }| ||d <q�|r�| ||d |kr�|r�||d <|�|�i}||kr�d}n||d <d}}}n| ||d <|�|�i}d}}}d}q�||k�r0|�r0|s�| |}| |||k�r|}| |||k�rq�||d <|d |d | k�r%|�|�i}d}}}d}q�q�|�rF||d | k�rF||d <|�|�t|�D]�\}}|dk�r`ttd|d | ��|d <|t |�dk�r�||dd |d }|d| k�r�|d t|d�7<ttd||dd |d��||dd <�qJtt||d | ��|d <ttd||dd | ��||dd <�qJtt||d | ��|d <�qJ|S)a�This method is used for splitting long audios into speech chunks using silero VAD. Args: audio: One dimensional float array. vad_options: Options for VAD processing. kwargs: VAD options passed as keyword arguments for backward compatibility. progress: Gradio progress to indicate progress. Returns: List of dicts containing begin and end samples of each speech chunk. N����br������Fg333333�?g{�G�z�?T�start�endr)r� update_modelr� threshold� neg_threshold�min_speech_duration_ms�max_speech_duration_s�min_silence_duration_msr � speech_pad_msr �lenr�padr�reshape�squeeze�max� enumerate�append�int�min)rrrr�kwargsr.r/r0r1r2r r3�min_speech_samples�speech_pad_samples�max_speech_samples�min_silence_samples�!min_silence_samples_at_max_speech�audio_length_samples� padded_audio� speech_probs� triggered�speeches�current_speech�temp_end�prev_end� next_start�i� speech_prob�speech�silence_durationrrrr Es� ����   �    �  ��  ��� �zSileroVAD.get_speech_timestampscCs t�|_dS�N)rrrrrrr-�s zSileroVAD.update_model�chunkscs,|s tjgtjd�St��fdd�|D��S)z'Collects and concatenates audio chunks.)�dtypecs g|] }�|d|d��qS)r+r,r)�.0�chunk�rrr� <listcomp>�s z,SileroVAD.collect_chunks.<locals>.<listcomp>)r�array�float32� concatenate)rrQrrUrr!�szSileroVAD.collect_chunksF�.�seconds�always_include_hours�decimal_markercCs�|dksJd��t|d�}|d}||d8}|d}||d8}|d}||d8}|s2|dkr8|d�d�nd }|�|d�d|d�|�|d ��S) Nrznon-negative timestamp expectedg@�@i��6i`�r&�02d�:��03d)�round)r[r\r]� milliseconds�hours�minutes� hours_markerrrr�format_timestamp�s    �zSileroVAD.format_timestamp�segmentsr$r c Cs�|dur|j}t||�}|D]M}|jrMg}|jD]$}|j|jd}|�|�} |�|j| �|_|�|j| �|_|�|�q|dj|_|dj|_||_q|�|j�|_|�|j�|_q|S)Nr'rr*)r r �wordsr+r,�get_chunk_index�get_original_timer:) rrhr$r �ts_map�segmentri�word�middle� chunk_indexrrr�restore_speech_timestamps�s$      z#SileroVAD.restore_speech_timestamps)FrZrP)�__name__� __module__� __qualname__r�gr�Progressr�strrrrrrrrr%rr r-� staticmethodr!�float�boolrg�Segmentr;rqrrrrr sb���� �3���� � ���� ������r )�faster_whisper.vadrr�numpyr�typingrrrrr�warnings�bisectr�faster_whisper.transcriber �gradioru�modules.whisper.data_classesr rrrr�<module>s 
Memory