o �J�hb�@sFddlZddlZddlZddlZddlZddlmZmZddl m Z m Z m Z m Z mZddlZddlmZddlmZddlZddlmZddlZddlmZddlmZmZmZmZmZdd l Tdd l!m"Z"dd l#Tdd l$m%Z%m&Z&dd l'm(Z(m)Z)m*Z*m+Z+m,Z,dd l-m.Z.dd l/Tddl0m1Z1ddl2m3Z3e"�Z4Gdd�de�Z5dS)�N)�ABC�abstractmethod)�BinaryIO�Union�Tuple�List�Callable)�datetime)� VadOptions)�deepcopy)�MusicSeparator)�WHISPER_MODELS_DIR�DIARIZATION_MODELS_DIR� OUTPUT_DIR�DEFAULT_PARAMETERS_CONFIG_PATH�UVR_MODELS_DIR)�*)� get_logger)� get_ytdata� get_ytaudio)�get_media_files�format_gradio_files� load_yaml� save_yaml� read_file)�validate_audio)�Diarizer)� SileroVADc@sHeZdZeeeefdedededefdd�Ze e � �dfde ee ejfd e j d eefd d ��Ze e � �fd eded e j fdd��Ze � �dddfde ee ejfd e j deded eedeeeeff dd�Zdddddde � �fdeedeedeedeedededeeeffdd�Zdde � �fdedededeeeffdd �Zdde � �fd!edededeeeffd"d#�Zd$d%�Zd&d'�Zd(d)�Ze d*edefd+d,��Z!e d-d.��Z"e d/d0��Z#e d1eefd2d3��Z$e d4e%fd5d6��Z&e  d>d4e%dedefd7d8��Z'e 9 d?de eejfd:e(d;ee(dejfd<d=��Z)dS)@�BaseTranscriptionPipeline� model_dir�diarization_model_dir� uvr_model_dir� output_dircCs�||_||_tj|jdd�tj|jdd�t|d�|_t�|_t|tj � |d�d�|_ d|_ d|_ t��|_tttjj����|_|��|_|��|_|��|_dS)NT)�exist_ok)r�UVR)rr")rr"�os�makedirsr�diarizerr�vadr �path�join�music_separator�model�current_model_size�whisper�available_models�sorted�list� tokenizer� LANGUAGES�values�available_langs� get_device�device�get_available_compute_type�available_compute_types�get_compute_type�current_compute_type)�selfrr r!r"�r=�SC:\pinokio\api\whisper-webui.git\app\modules\whisper\base_transcription_pipeline.py�__init__!s&� �   z"BaseTranscriptionPipeline.__init__N�audio�progress�progress_callbackcG�dS)z%Inference whisper model to transcribeNr=)r<r@rArB�whisper_paramsr=r=r>� transcribe<sz$BaseTranscriptionPipeline.transcribe� model_size� compute_typecCrC)zInitialize whisper modelNr=)r<rFrGrAr=r=r>� update_modelFsz&BaseTranscriptionPipeline.update_model�SRTT� file_format� add_timestamp�returncGs(t��}t|�st�gdfSt�t|��}|�|�}|j|j|j |j f\} } } } | j rl|j j || j| j| j| j|d�\} }}|jdkr^|jdd�}|j jdurRd}n|j jj}|j||d�}| jrf|j ��t��|}t|�}| jr�|dd d �t| j| j| j| j| j d �}|jj!|||d �\}}|j"dkr�|}nd | _|j#|||g| �$��R�\}}| jr�|��| jr�|jj%||d�}|r�|}nt&�'d�| j(r�|ddd �|j)j!|| j*r�| j*nt+j,�-d�|| j.d�\}}| jr�|j)��|j/|||d�|�st&�'d�t�g}|ddd �t��|}||fS)a� Run transcription with conditional pre-processing and post-processing. The VAD will be performed to remove noise from the audio input in pre-processing, if enabled. The diarization will be performed in post-processing, if enabled. Due to the integration with gradio, the parameters have to be specified with a `*` wildcard. Parameters ---------- audio: Union[str, BinaryIO, np.ndarray] Audio input. This can be file path or binary type. progress: gr.Progress Indicator to show progress directly in gradio. file_format: str Subtitle file format between ["SRT", "WebVTT", "txt", "lrc"] add_timestamp: bool Whether to add a timestamp at the end of the filename. progress_callback: Optional[Callable] callback function to show progress. Can be used to update progress in the backend. *pipeline_params: tuple Parameters for the transcription pipeline. This will be dealt with "TranscriptionPipelineParams" data class. This must be provided as a List with * wildcard because of the integration with gradio. See more info at : https://github.com/gradio-app/gradio/issues/2471 Returns ---------- segments_result: List[Segment] list of Segment that includes start, end timestamps and transcribed text elapsed_time: float elapsed time for running r)r@� model_namer7� segment_size� save_filerA��)�axisN�>)r@�original_sample_ratez#Filtering silent parts from audio..��desc)� threshold�min_speech_duration_ms�max_speech_duration_s�min_silence_duration_ms� speech_pad_ms)r@�vad_parametersrAF)�segments� speech_chunksz-VAD detected no speech segments in the audio.g�G�z��?zDiarizing speakers..�HF_TOKEN)r@�use_auth_tokenZtranscribed_resultr7)�paramsrJrKz:Whisper did not detected any speech segments in the audio.g�?z Finished.)0�timer�Segment�TranscriptionPipelineParams� from_listr1�validate_gradio_values�bgm_separationr(r.� diarization�is_separate_bgmr+Zseparate�uvr_model_size� uvr_devicerNrO�ndim�meanZ audio_info� sample_rate�resample_audio�enable_offload�offloadr � vad_filterr rWrXrYrZr[�run�sizerE�to_list�restore_speech_timestamps�logger�info� is_diarizer'�hf_tokenr%�environ�get�diarization_device�cache_parameters)r<r@rArJrKrB�pipeline_params� start_timeraZ bgm_params� vad_paramsrD�diarization_paramsZmusic�_Zorigin_sample_rateZelapsed_time_bgm_sepZ origin_audio� vad_optionsZ vad_processedr^�resultZelapsed_time_transcriptionZrestored_resultZelapsed_time_diarizationZtotal_elapsed_timer=r=r>rsOs�'   �      � � � ��   � �   zBaseTranscriptionPipeline.run�files�input_folder_path�include_subdirectory� save_same_dirc Gs�z�t�t|��} d| jjrdndi} |rt||d�}t|t�r"|g}|r4t|dtj j �r4dd�|D�}i} |D]P} |j | |||dg|�R�\} }t j �t j �| ��\}}|rn|rnt j �| �}td|||| |d �| ��\}}td|j||| |d �| ��\}}t|�||d �| |<q8d }d}| ��D]\}}|d 7}||�d �7}||d�7}||d7}q�d|�|��d|��}dd�| ��D�}||fWSty�}ztd|���|�d}~ww)a Write subtitle file from Files Parameters ---------- files: list List of files to transcribe from gr.Files() input_folder_path: Optional[str] Input folder path to transcribe from gr.Textbox(). If this is provided, `files` will be ignored and this will be used instead. include_subdirectory: Optional[str] When using `input_folder_path`, whether to include all files in the subdirectory or not save_same_dir: Optional[str] When using `input_folder_path`, whether to save output in the same directory as inputs or not, in addition to the original output directory. This feature is only available when using `input_folder_path`, because gradio only allows to use cached file path in the function yet. file_format: str Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt] add_timestamp: bool Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename. progress: gr.Progress Indicator to show progress directly in gradio. *pipeline_params: tuple Parameters for the transcription pipeline. This will be dealt with "TranscriptionPipelineParams" data class Returns ---------- result_str: Result of transcription to return to gr.Textbox() result_file_path: Output file path to return to gr.Files() �highlight_wordsTF)�include_sub_directoryrcSsg|]}|j�qSr=)�name)�.0�filer=r=r>� <listcomp> sz=BaseTranscriptionPipeline.transcribe_file.<locals>.<listcomp>N�r"�output_file_name� output_formatr�rK)�subtitle� time_for_taskr)�z%------------------------------------ z r�r��Done in z&! Subtitle is in the outputs folder. cSsg|]}|d�qS)r)r=)r�rxr=r=r>r�9szError transcribing file: r=)rdrer1r.�word_timestampsr� isinstance�str�gr�utils� NamedStringrsr%r)�splitext�basename�dirname� generate_filer"r�items� format_timer4� Exception� RuntimeError)r<r�r�r�r�rJrKrArra�writer_optionsZ files_infor��transcribed_segmentsr�� file_name�file_extr"r�� file_pathZ total_result� total_timerx� result_strZresult_file_path�er=r=r>�transcribe_file�sr*�  � �  � � � � ��z)BaseTranscriptionPipeline.transcribe_file� mic_audioc Gs�zNt�t|��}d|jjrdndi}|ddd�|j||||dg|�R�\}} |dd d�d } td|j| |||d �|��\} } d |�| ��d | ��} | | fWSt yb}zt d|���|�d}~ww)aR Write subtitle file from microphone Parameters ---------- mic_audio: str Audio file path from gr.Microphone() file_format: str Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt] add_timestamp: bool Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename. progress: gr.Progress Indicator to show progress directly in gradio. *pipeline_params: tuple Parameters related with whisper. This will be dealt with "WhisperParameters" data class Returns ---------- result_str: Result of transcription to return to gr.Textbox() result_file_path: Output file path to return to gr.Files() r�TFrzLoading Audio..rUNrQ� Completed!�Micr�r��+! Subtitle file is in the outputs folder. zError transcribing mic: r=) rdrer1r.r�rsr�r"r�r�r�)r<r�rJrKrArrar�r�r�r�r�r�r�r�r=r=r>�transcribe_mic@s>� � � � � ��z(BaseTranscriptionPipeline.transcribe_mic� youtube_linkc Gs�zdt�t|��}d|jjrdndi}|ddd�t|�}t|�} |j| |||dg|�R�\} } |dd d�t|j �} t d|j | || |d �|��\} }d |� | ��d | ��}t j�| �r`t �| �||fWStyx}ztd |���|�d}~ww)af Write subtitle file from Youtube Parameters ---------- youtube_link: str URL of the Youtube video to transcribe from gr.Textbox() file_format: str Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt] add_timestamp: bool Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename. progress: gr.Progress Indicator to show progress directly in gradio. *pipeline_params: tuple Parameters related with whisper. This will be dealt with "WhisperParameters" data class Returns ---------- result_str: Result of transcription to return to gr.Textbox() result_file_path: Output file path to return to gr.Files() r�TFrzLoading Audio from Youtube..rUNrQr�r�r�r�zError transcribing youtube: r=)rdrer1r.r�rrrsZ safe_filename�titler�r"r�r%r)�exists�remover�r�)r<r�rJrKrArrar�Zytr@r�r�r�r�r�r�r�r=r=r>�transcribe_youtube~sF� � � � �   ��z,BaseTranscriptionPipeline.transcribe_youtubecCs&d|jvrdSd|jvrdS|jdS)N�float16�float32r)r9�r<r=r=r>r:�s   z*BaseTranscriptionPipeline.get_compute_typecCs&|jdkr tt�d��Stt�d��S)N�cuda�cpu)r7r1� ctranslate2�get_supported_compute_typesr�r=r=r>r8�s z4BaseTranscriptionPipeline.get_available_compute_typecCsf|jdur |`d|_|jdkrtj��tj��|jdkr-tj��tj��tj��t � �dS)z(Offload the model and free up the memoryNr��xpu) r,r7�torchr�� empty_cache�reset_max_memory_allocatedr��reset_accumulated_memory_stats�reset_peak_memory_stats�gc�collectr�r=r=r>rq�s         z!BaseTranscriptionPipeline.offload� elapsed_timecCsbt|d�\}}t|d�\}}d}|r||�d�7}|r"||�d�7}t|�}||�d�7}|��S)z� Get {hours} {minutes} {seconds} time format string Parameters ---------- elapsed_time: str Elapsed time for transcription Returns ---------- Time format string i�<r�z hours z minutes z seconds)�divmod�round�strip)r��hours�rem�minutes�secondsZtime_strr=r=r>r��sz%BaseTranscriptionPipeline.format_timecCs<tj��rdStj��rdStjj��rt��sdSdSdS)Nr�r�r��mps)r�r�� is_availabler��backendsr�r�is_sparse_api_supportedr=r=r=r>r6�s   z$BaseTranscriptionPipeline.get_devicecCsdtjj��sdSzt�d�}tjt�ddgddgg�t�ddg�d|d�}Wd Sty1YdSw) NFr�rrQrP�)�r�)�indicesr4rtr7T)r�r�r�r�r7�sparse_coo_tensor�tensorr�)r7Z sparse_tensorr=r=r>r� s   � �z1BaseTranscriptionPipeline.is_sparse_api_supported� file_pathscCs0|sdS|D]}|rtj�|�rt�|�qdS)zRemove gradio cached filesN)r%r)r�r�)r�r�r=r=r>�remove_input_filess ��z,BaseTranscriptionPipeline.remove_input_filesracCs�|jjdurn|jjtkrd|j_ndd�tjj��D�}||jj|j_|jjtkr/d|j_|jjtkr9d|j_|jj tkrCd|j_ |jj t krMd|j_ |jj t krWd|j_ |jj t krad|j_ |jjtkrmtd�|j_|S)z� Validate gradio specific values that can't be displayed as None in the UI. Related issue : https://github.com/gradio-app/gradio/issues/8723 NcSsi|]\}}||�qSr=r=)r��key�valuer=r=r>� <dictcomp>0szDBaseTranscriptionPipeline.validate_gradio_values.<locals>.<dictcomp>�inf)r.�lang�AUTOMATIC_DETECTIONr2r3r��initial_prompt�GRADIO_NONE_STR�prefix�hotwords�max_new_tokens�GRADIO_NONE_NUMBER_MIN�hallucination_silence_threshold�language_detection_thresholdr(rY�GRADIO_NONE_NUMBER_MAX�float)raZlanguage_code_dictr=r=r>rf%s*           z0BaseTranscriptionPipeline.validate_gradio_valuescCs�tt�}|��}i|�|�}||dd<||dd<|d�dd�}|r1t|t�r1t|�|dd<|d�dd�durDt��|dd<nt j j }||dd|dd<|d�dt d ��t d �krht |dd<|duru|rwt|t�dSdSdS) z!Cache parameters to the yaml filer.rKrJ�suppress_tokensNr�r(rYr�)rr�to_dictr|r�r1r�r��unwrapr.r2r3r�r�r)rarJrKZ cached_paramsZparam_to_cacheZ cached_yamlZ supress_tokenZ language_dictr=r=r>r~Cs"     �z*BaseTranscriptionPipeline.cache_parametersrS�new_sample_raterTcCsTt|t�r t�|�\}}n |durtd��t�|�}tjj||d�}||�� �}|S)z=Resamples audio to 16k sample rate, standard on Whisper modelNz@original_sample_rate must be provided when audio is numpy array.)Z orig_freqZnew_freq) r�r�� torchaudio�load� ValueErrorr�� from_numpy� transformsZResample�numpy)r@r�rT� resamplerZresampled_audior=r=r>roas   z(BaseTranscriptionPipeline.resample_audio)rIT)rSN)*�__name__� __module__� __qualname__r rrrr�r?rr��Progressrr�np�ndarray�OptionalrrErH�boolrrrcr�rsr�r�r�r:r8rq� staticmethodr�r6r�r�rdrfr~�intror=r=r=r>r s���� ���� � ��� � ������ � ������� �j����  �@����  �G   ��� �����r)6r%r.r��gradior�r��abcrr�typingrrrrrr�r�r �faster_whisper.vadr r��copyr rbZmodules.uvr.music_separatorr �modules.utils.pathsr rrrr�modules.utils.constants�modules.utils.loggerrZmodules.utils.subtitle_manager�modules.utils.youtube_managerrr�modules.utils.files_managerrrrrrZmodules.utils.audio_managerr�modules.whisper.data_classesZmodules.diarize.diarizerrZmodules.vad.silero_vadrrwrr=r=r=r>�<module>s4        
Memory