o �J�h )�@szddlmZmZddlZddlmZmZmZm Z m Z ddlZddlZddl Z ddlmZddlZddlTGdd�d�ZdS)�)� VadOptions� get_vad_modelN)�BinaryIO�Union�List�Optional�Tuple)�SpeechTimestampsMap)�*c@s�eZdZdd�Ze��fdeeee j fdedejdee j e effdd�Zd e��fde j d eedejde efdd�Zd d�Zede j de ede j fdd��Ze ddedededefdd��Z dde ede edeede efdd�Zd S) � SileroVADcCsd|_d|_d|_dS)Ni�>i)� sampling_rate�window_size_samples�model��self�r�>C:\pinokio\api\whisper-webui.git\app\modules\vad\silero_vad.py�__init__s zSileroVAD.__init__�audio�vad_parameters�progress�returncCs�|j}t|tj�stj||d�}|jd|}|}|dur!t�}nt|t�r-tdi|��}|j |||d�}|� ||�}|jd|}||fS)a Run VAD Parameters ---------- audio: Union[str, BinaryIO, np.ndarray] Audio path or file binary or Audio numpy array vad_parameters: Options for VAD processing. progress: gr.Progress Indicator to show progress directly in gradio. Returns ---------- np.ndarray Pre-processed audio with VAD List[dict] Chunks of speeches to be used to restore the timestamps later )rrN)r�vad_optionsrr)r� isinstance�np�ndarray�faster_whisper�decode_audio�shaper�dict�get_speech_timestamps�collect_chunks)rrrrr�duration�duration_after_vad� speech_chunksrrr�runs" �z SileroVAD.runNrcKs�|jdur |��|durtdi|��}|j}|j}|j}|j}|j} |j} |j }|j |d}|j |d} |j || d| }|j | d}|j dd}t|�}t� |d| |jd| f�}|�|�dd��d�}d}g}i}|dur�t|d d �}d}d}}t|�D]�\}}||kr�|r�d}||kr�| |}||kr�|s�d}| ||d<q�|r�| ||d|kr�|r�||d <|�|�i}||kr�d}n||d<d}}}n| ||d <|�|�i}d}}}d}q�||k�r0|�r0|s�| |}| |||k�r|}| |||k�rq�||d <|d |d|k�r%|�|�i}d}}}d}q�q�|�rF||d|k�rF||d <|�|�t|�D]�\}}|dk�r`ttd|d| ��|d<|t|�dk�r�||dd|d }|d| k�r�|d t|d�7<ttd||dd|d��||dd<�qJtt||d | ��|d <ttd||dd| ��||dd<�qJtt||d | ��|d <�qJ|S)a�This method is used for splitting long audios into speech chunks using silero VAD. Args: audio: One dimensional float array. vad_options: Options for VAD processing. kwargs: VAD options passed as keyword arguments for backward compatibility. progress: Gradio progress to indicate progress. Returns: List of dicts containing begin and end samples of each speech chunk. N��br��Fg333333�?g{�G�z�?T�start�endr)r�update_modelr� threshold� neg_threshold�min_speech_duration_ms�max_speech_duration_s�min_silence_duration_msr � speech_pad_msr�lenr�padr�reshape�squeeze�max� enumerate�append�int�min)rrrr�kwargsr.r/r0r1r2r r3�min_speech_samples�speech_pad_samples�max_speech_samples�min_silence_samples�!min_silence_samples_at_max_speech�audio_length_samples�padded_audio�speech_probs� triggered�speeches�current_speech�temp_end�prev_end� next_start�i�speech_prob�speech�silence_durationrrrr Es� �� zSileroVAD.get_speech_timestampscCst�|_dS�N)rrrrrrr-�szSileroVAD.update_model�chunkscs,|s tjgtjd�St��fdd�|D��S)z'Collects and concatenates audio chunks.)�dtypecs g|]}�|d|d��qS)r+r,r)�.0�chunk�rrr� <listcomp>�s z,SileroVAD.collect_chunks.<locals>.<listcomp>)r�array�float32�concatenate)rrQrrUrr!�szSileroVAD.collect_chunksF�.�seconds�always_include_hours�decimal_markercCs�|dksJd��t|d�}|d}||d8}|d}||d8}|d}||d8}|s2|dkr8|d�d�nd }|�|d�d|d�|�|d ��S)Nrznon-negative timestamp expectedg@�@i��6i`�r&�02d�:��03d)�round)r[r\r]�milliseconds�hours�minutes�hours_markerrrr�format_timestamp�s�zSileroVAD.format_timestamp�segmentsr$rc Cs�|dur|j}t||�}|D]M}|jrMg}|jD]$}|j|jd}|�|�} |�|j| �|_|�|j| �|_|�|�q|dj|_|dj|_||_q|�|j�|_|�|j�|_q|S)Nr'rr*)rr �wordsr+r,�get_chunk_index�get_original_timer:) rrhr$r�ts_map�segmentri�word�middle�chunk_indexrrr�restore_speech_timestamps�s$ z#SileroVAD.restore_speech_timestamps)FrZrP)�__name__� __module__�__qualname__r�gr�Progressr�strrrrrrrrr%rr r-�staticmethodr!�float�boolrg�Segmentr;rqrrrrrsb�� 3�� r)�faster_whisper.vadrr�numpyr�typingrrrrr�warnings�bisectr�faster_whisper.transcriber �gradioru�modules.whisper.data_classesrrrrr�<module>s