o �J�hb�@sFddlZddlZddlZddlZddlZddlmZmZddl m Z mZmZm Z mZddlZddlmZddlmZddlZddlmZddlZddlmZddlmZmZmZmZmZdd l Tdd l!m"Z"dd l#Tddl$m%Z%m&Z&ddl'm(Z(m)Z)m*Z*m+Z+m,Z,dd l-m.Z.dd l/Tddl0m1Z1ddl2m3Z3e"�Z4Gdd�de�Z5dS)�N)�ABC�abstractmethod)�BinaryIO�Union�Tuple�List�Callable)�datetime)� VadOptions)�deepcopy)�MusicSeparator)�WHISPER_MODELS_DIR�DIARIZATION_MODELS_DIR� OUTPUT_DIR�DEFAULT_PARAMETERS_CONFIG_PATH�UVR_MODELS_DIR)�*)� get_logger)� get_ytdata�get_ytaudio)�get_media_files�format_gradio_files� load_yaml� save_yaml� read_file)�validate_audio)�Diarizer)� SileroVADc@sHeZdZeeeefdedededefdd�Ze e ��dfdeee ejfd e jd eefdd��Ze e ��fd eded e jfdd��Ze ��dddfdeee ejfd e jdeded eedeeeeffdd�Zdddddde ��fdeedeedeedeedededeeeffdd�Zdde ��fdedededeeeffdd �Zdde ��fd!edededeeeffd"d#�Zd$d%�Zd&d'�Zd(d)�Ze d*edefd+d,��Z!e d-d.��Z"e d/d0��Z#e d1eefd2d3��Z$e d4e%fd5d6��Z&e d>d4e%dedefd7d8��Z'e 9 d?deeejfd:e(d;ee(dejfd<d=��Z)dS)@�BaseTranscriptionPipeline� model_dir�diarization_model_dir� uvr_model_dir� output_dircCs�||_||_tj|jdd�tj|jdd�t|d�|_t�|_t|tj � |d�d�|_d|_d|_ t��|_tttjj��|_|��|_|��|_|��|_dS)NT)�exist_ok)r�UVR)rr")rr"�os�makedirsr�diarizerr�vadr�path�join�music_separator�model�current_model_size�whisper�available_models�sorted�list� tokenizer� LANGUAGES�values�available_langs� get_device�device�get_available_compute_type�available_compute_types�get_compute_type�current_compute_type)�selfrr r!r"�r=�SC:\pinokio\api\whisper-webui.git\app\modules\whisper\base_transcription_pipeline.py�__init__!s&�� z"BaseTranscriptionPipeline.__init__N�audio�progress�progress_callbackcG�dS)z%Inference whisper model to transcribeNr=)r<r@rArB�whisper_paramsr=r=r>� transcribe<sz$BaseTranscriptionPipeline.transcribe� model_size�compute_typecCrC)zInitialize whisper modelNr=)r<rFrGrAr=r=r>�update_modelFsz&BaseTranscriptionPipeline.update_model�SRTT�file_format� add_timestamp�returncGs(t��}t|�st�gdfSt�t|��}|�|�}|j|j|j |j f\} } }}| jrl|jj || j| j| j| j|d�\} }}|jdkr^|jdd�}|jjdurRd}n|jjj}|j||d�}| jrf|j��t��|}t|�}| jr�|dd d �t| j| j| j| j| j d�}|jj!|||d�\}}|j"dkr�|}nd | _|j#|||g|�$��R�\}}|jr�|��| jr�|jj%||d�}|r�|}nt&�'d�|j(r�|ddd �|j)j!||j*r�|j*nt+j,�-d�||j.d�\}}|jr�|j)��|j/|||d�|�st&�'d�t�g}|ddd �t��|}||fS)a� Run transcription with conditional pre-processing and post-processing. The VAD will be performed to remove noise from the audio input in pre-processing, if enabled. The diarization will be performed in post-processing, if enabled. Due to the integration with gradio, the parameters have to be specified with a `*` wildcard. Parameters ---------- audio: Union[str, BinaryIO, np.ndarray] Audio input. This can be file path or binary type. progress: gr.Progress Indicator to show progress directly in gradio. file_format: str Subtitle file format between ["SRT", "WebVTT", "txt", "lrc"] add_timestamp: bool Whether to add a timestamp at the end of the filename. progress_callback: Optional[Callable] callback function to show progress. Can be used to update progress in the backend. *pipeline_params: tuple Parameters for the transcription pipeline. This will be dealt with "TranscriptionPipelineParams" data class. This must be provided as a List with * wildcard because of the integration with gradio. See more info at : https://github.com/gradio-app/gradio/issues/2471 Returns ---------- segments_result: List[Segment] list of Segment that includes start, end timestamps and transcribed text elapsed_time: float elapsed time for running r)r@� model_namer7�segment_size� save_filerA��)�axisN�>)r@�original_sample_ratez#Filtering silent parts from audio..��desc)� threshold�min_speech_duration_ms�max_speech_duration_s�min_silence_duration_ms� speech_pad_ms)r@�vad_parametersrAF)�segments� speech_chunksz-VAD detected no speech segments in the audio.g�G�z��?zDiarizing speakers..�HF_TOKEN)r@�use_auth_tokenZtranscribed_resultr7)�paramsrJrKz:Whisper did not detected any speech segments in the audio.g�?z Finished.)0�timer�Segment�TranscriptionPipelineParams� from_listr1�validate_gradio_values�bgm_separationr(r.�diarization�is_separate_bgmr+Zseparate�uvr_model_size� uvr_devicerNrO�ndim�meanZ audio_info�sample_rate�resample_audio�enable_offload�offloadr� vad_filterr rWrXrYrZr[�run�sizerE�to_list�restore_speech_timestamps�logger�info� is_diarizer'�hf_tokenr%�environ�get�diarization_device�cache_parameters)r<r@rArJrKrB�pipeline_params� start_timeraZ bgm_params� vad_paramsrD�diarization_paramsZmusic�_Zorigin_sample_rateZelapsed_time_bgm_sepZorigin_audio�vad_optionsZ vad_processedr^�resultZelapsed_time_transcriptionZrestored_resultZelapsed_time_diarizationZtotal_elapsed_timer=r=r>rsOs�' � � � �� zBaseTranscriptionPipeline.run�files�input_folder_path�include_subdirectory� save_same_dirc Gs�z�t�t|��} d| jjrdndi} |rt||d�}t|t�r"|g}|r4t|dtj j �r4dd�|D�}i}|D]P}|j||||dg|�R�\} }tj �tj �|��\}}|rn|rntj �|�}td|||| |d �| ��\}}td|j||| |d �| ��\}}t|�||d �||<q8d}d}|��D]\}}|d7}||�d �7}||d�7}||d7}q�d|�|��d|��}dd�|��D�}||fWSty�}ztd|��|�d}~ww)a Write subtitle file from Files Parameters ---------- files: list List of files to transcribe from gr.Files() input_folder_path: Optional[str] Input folder path to transcribe from gr.Textbox(). If this is provided, `files` will be ignored and this will be used instead. include_subdirectory: Optional[str] When using `input_folder_path`, whether to include all files in the subdirectory or not save_same_dir: Optional[str] When using `input_folder_path`, whether to save output in the same directory as inputs or not, in addition to the original output directory. This feature is only available when using `input_folder_path`, because gradio only allows to use cached file path in the function yet. file_format: str Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt] add_timestamp: bool Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename. progress: gr.Progress Indicator to show progress directly in gradio. *pipeline_params: tuple Parameters for the transcription pipeline. This will be dealt with "TranscriptionPipelineParams" data class Returns ---------- result_str: Result of transcription to return to gr.Textbox() result_file_path: Output file path to return to gr.Files() �highlight_wordsTF)�include_sub_directoryrcSsg|]}|j�qSr=)�name)�.0�filer=r=r>� <listcomp> sz=BaseTranscriptionPipeline.transcribe_file.<locals>.<listcomp>N�r"�output_file_name� output_formatr�rK)�subtitle� time_for_taskr)�z%------------------------------------ z r�r��Done in z&! Subtitle is in the outputs folder. cSsg|]}|d�qS)r)r=)r�rxr=r=r>r�9szError transcribing file: r=)rdrer1r.�word_timestampsr� isinstance�str�gr�utils�NamedStringrsr%r)�splitext�basename�dirname� generate_filer"r�items�format_timer4� Exception�RuntimeError)r<r�r�r�r�rJrKrArra�writer_optionsZ files_infor��transcribed_segmentsr�� file_name�file_extr"r�� file_pathZtotal_result� total_timerx� result_strZresult_file_path�er=r=r>�transcribe_file�sr*� �� z)BaseTranscriptionPipeline.transcribe_file� mic_audioc Gs�zNt�t|��}d|jjrdndi}|ddd�|j||||dg|�R�\}} |dd d�d } td|j| |||d�|��\}}d|�| ��d |��} | |fWSt yb}zt d|��|�d}~ww)aR Write subtitle file from microphone Parameters ---------- mic_audio: str Audio file path from gr.Microphone() file_format: str Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt] add_timestamp: bool Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename. progress: gr.Progress Indicator to show progress directly in gradio. *pipeline_params: tuple Parameters related with whisper. This will be dealt with "WhisperParameters" data class Returns ---------- result_str: Result of transcription to return to gr.Textbox() result_file_path: Output file path to return to gr.Files() r�TFrzLoading Audio..rUNrQ� Completed!�Micr�r��+! Subtitle file is in the outputs folder. zError transcribing mic: r=)rdrer1r.r�rsr�r"r�r�r�)r<r�rJrKrArrar�r�r�r�r�r�r�r�r=r=r>�transcribe_mic@s>�� z(BaseTranscriptionPipeline.transcribe_mic�youtube_linkc Gs�zdt�t|��}d|jjrdndi}|ddd�t|�}t|�} |j| |||dg|�R�\} }|dd d�t|j �}t d|j||| |d �|��\} }d|�|��d| ��}t j�| �r`t �| �||fWStyx}ztd |��|�d}~ww)af Write subtitle file from Youtube Parameters ---------- youtube_link: str URL of the Youtube video to transcribe from gr.Textbox() file_format: str Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt] add_timestamp: bool Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename. progress: gr.Progress Indicator to show progress directly in gradio. *pipeline_params: tuple Parameters related with whisper. This will be dealt with "WhisperParameters" data class Returns ---------- result_str: Result of transcription to return to gr.Textbox() result_file_path: Output file path to return to gr.Files() r�TFrzLoading Audio from Youtube..rUNrQr�r�r�r�zError transcribing youtube: r=)rdrer1r.r�rrrsZ safe_filename�titler�r"r�r%r)�exists�remover�r�)r<r�rJrKrArrar�Zytr@r�r�r�r�r�r�r�r=r=r>�transcribe_youtube~sF�� z,BaseTranscriptionPipeline.transcribe_youtubecCs&d|jvrdSd|jvrdS|jdS)N�float16�float32r)r9�r<r=r=r>r:�s z*BaseTranscriptionPipeline.get_compute_typecCs&|jdkrtt�d��Stt�d��S)N�cuda�cpu)r7r1�ctranslate2�get_supported_compute_typesr�r=r=r>r8�s z4BaseTranscriptionPipeline.get_available_compute_typecCsf|jdur |`d|_|jdkrtj��tj��|jdkr-tj��tj��tj��t � �dS)z(Offload the model and free up the memoryNr��xpu)r,r7�torchr��empty_cache�reset_max_memory_allocatedr��reset_accumulated_memory_stats�reset_peak_memory_stats�gc�collectr�r=r=r>rq�s z!BaseTranscriptionPipeline.offload�elapsed_timecCsbt|d�\}}t|d�\}}d}|r||�d�7}|r"||�d�7}t|�}||�d�7}|��S)z� Get {hours} {minutes} {seconds} time format string Parameters ---------- elapsed_time: str Elapsed time for transcription Returns ---------- Time format string i�<r�z hours z minutes z seconds)�divmod�round�strip)r��hours�rem�minutes�secondsZtime_strr=r=r>r��sz%BaseTranscriptionPipeline.format_timecCs<tj��rdStj��rdStjj��rt��sdSdSdS)Nr�r�r��mps)r�r��is_availabler��backendsr�r�is_sparse_api_supportedr=r=r=r>r6�s z$BaseTranscriptionPipeline.get_devicecCsdtjj��sdSzt�d�}tjt�ddgddgg�t�ddg�d|d�}Wd Sty1YdSw) NFr�rrQrP�)�r�)�indicesr4rtr7T)r�r�r�r�r7�sparse_coo_tensor�tensorr�)r7Z sparse_tensorr=r=r>r� s ��z1BaseTranscriptionPipeline.is_sparse_api_supported� file_pathscCs0|sdS|D]}|rtj�|�rt�|�qdS)zRemove gradio cached filesN)r%r)r�r�)r�r�r=r=r>�remove_input_filess ��z,BaseTranscriptionPipeline.remove_input_filesracCs�|jjdurn|jjtkrd|j_ndd�tjj��D�}||jj|j_|jjtkr/d|j_|jjtkr9d|j_|jj tkrCd|j_ |jj tkrMd|j_ |jjtkrWd|j_|jj tkrad|j_ |jjtkrmtd�|j_|S)z� Validate gradio specific values that can't be displayed as None in the UI. Related issue : https://github.com/gradio-app/gradio/issues/8723 NcSsi|]\}}||�qSr=r=)r��key�valuer=r=r>� <dictcomp>0szDBaseTranscriptionPipeline.validate_gradio_values.<locals>.<dictcomp>�inf)r.�lang�AUTOMATIC_DETECTIONr2r3r��initial_prompt�GRADIO_NONE_STR�prefix�hotwords�max_new_tokens�GRADIO_NONE_NUMBER_MIN�hallucination_silence_threshold�language_detection_thresholdr(rY�GRADIO_NONE_NUMBER_MAX�float)raZlanguage_code_dictr=r=r>rf%s* z0BaseTranscriptionPipeline.validate_gradio_valuescCs�tt�}|��}i|�|�}||dd<||dd<|d�dd�}|r1t|t�r1t|�|dd<|d�dd�durDt��|dd<nt j j}||dd|dd<|d�dtd ��td �krht |dd<|duru|rwt|t�dSdSdS) z!Cache parameters to the yaml filer.rKrJ�suppress_tokensNr�r(rYr�)rr�to_dictr|r�r1r�r��unwrapr.r2r3r�r�r)rarJrKZ cached_paramsZparam_to_cacheZcached_yamlZ supress_tokenZ language_dictr=r=r>r~Cs"�z*BaseTranscriptionPipeline.cache_parametersrS�new_sample_raterTcCsTt|t�r t�|�\}}n |durtd��t�|�}tjj||d�}||�� }|S)z=Resamples audio to 16k sample rate, standard on Whisper modelNz@original_sample_rate must be provided when audio is numpy array.)Z orig_freqZnew_freq) r�r�� torchaudio�load� ValueErrorr�� from_numpy� transformsZResample�numpy)r@r�rT� resamplerZresampled_audior=r=r>roas z(BaseTranscriptionPipeline.resample_audio)rIT)rSN)*�__name__� __module__�__qualname__r rrrr�r?rr��Progressrr�np�ndarray�OptionalrrErH�boolrrrcr�rsr�r�r�r:r8rq�staticmethodr�r6r�r�rdrfr~�intror=r=r=r>r s�� j�� @�� G ��r)6r%r.r��gradior�r��abcrr�typingrrrrrr�r�r �faster_whisper.vadr r��copyrrbZmodules.uvr.music_separatorr�modules.utils.pathsr rrrr�modules.utils.constants�modules.utils.loggerrZmodules.utils.subtitle_manager�modules.utils.youtube_managerrr�modules.utils.files_managerrrrrrZmodules.utils.audio_managerr�modules.whisper.data_classesZmodules.diarize.diarizerrZmodules.vad.silero_vadrrwrr=r=r=r>�<module>s4