#!/usr/local/lib/pytorch-venv/bin/python3
"""Translate every ``.srt`` subtitle file found in ``~/Documents/Subtitles``.

Each cue is translated with a CTranslate2 build of a MADLAD-400 model and the
translated file is written, UTF-8 encoded, to a ``translated`` sub-directory
next to the source files.
"""
import os
import sys

import chardet
import ctranslate2
import pysrt
import transformers
from huggingface_hub import snapshot_download

DEFAULT_MODEL = "zenoverflow/madlad400-10b-mt-ct2-int8-float16"
DEFAULT_DEVICE = "cuda"
DEFAULT_TARGET_LANG = "fa"


class Translator:
    """Thin wrapper around a CTranslate2 translator and its T5 tokenizer."""

    def __init__(self, model_name=DEFAULT_MODEL, device=DEFAULT_DEVICE):
        """Fetch the model snapshot from the Hugging Face Hub and load it.

        Args:
            model_name: Hub repository id of the CTranslate2-converted model.
            device: Device string passed to ``ctranslate2.Translator``.
        """
        model_path = snapshot_download(model_name)
        # Presumably terminates the line left open by the download progress
        # output; kept exactly as the script has always emitted it.
        print("\n", end="")
        self.translator = ctranslate2.Translator(model_path, device=device)
        self.tokenizer = transformers.T5Tokenizer.from_pretrained(model_path)

    def translate(self, text, target_lang_code=DEFAULT_TARGET_LANG):
        """Translate *text* and return the decoded best hypothesis.

        Args:
            text: Source text to translate.
            target_lang_code: Language code inserted into the ``<2xx>``
                prefix token that is prepended to the input.

        Returns:
            The translated text as a string.
        """
        input_text = f"<2{target_lang_code}> {text}"
        input_tokens = self.tokenizer.convert_ids_to_tokens(
            self.tokenizer.encode(input_text)
        )
        results = self.translator.translate_batch([input_tokens])
        # Only the first hypothesis of the single batch entry is used.
        output_tokens = results[0].hypotheses[0]
        return self.tokenizer.decode(
            self.tokenizer.convert_tokens_to_ids(output_tokens)
        )


class Subtitle:
    """One subtitle file on disk plus the logic to translate it."""

    def __init__(self, sub_dir, filename, translator=None):
        """Remember where the subtitle lives and who translates it.

        Args:
            sub_dir: Directory containing the subtitle file.
            filename: File name of the subtitle inside *sub_dir*.
            translator: Object exposing ``translate(text)``. When omitted,
                a module-level ``translator`` is looked up at translation
                time so existing two-argument callers keep working.
        """
        self.sub_dir = sub_dir
        self.filename = filename
        self.file_path = os.path.join(self.sub_dir, self.filename)
        self.translator = translator

    def translate_subtitle(self, encoding='UTF-8'):
        """Translate every non-blank cue and save the result.

        Args:
            encoding: Encoding used to read the source file.

        Returns:
            Path of the translated file, saved as UTF-8 under
            ``<sub_dir>/translated/<filename>``.
        """
        subtitles = pysrt.open(self.file_path, encoding=encoding)
        for sub in subtitles:
            # Multi-line cues are flattened so each cue is translated as
            # one piece of text.
            subtitle_line = sub.text.replace('\n', ' ')
            if not subtitle_line.strip():
                # Nothing to translate; leave blank cues untouched instead
                # of feeding an empty prompt to the model.
                continue
            sub.text = self.translate_text(subtitle_line)

        translated_dir = os.path.join(self.sub_dir, 'translated')
        # exist_ok avoids the check-then-create race.
        os.makedirs(translated_dir, exist_ok=True)
        output_srt_file_path = os.path.join(translated_dir, self.filename)
        subtitles.save(output_srt_file_path, encoding='UTF-8')
        return output_srt_file_path

    def translate_text(self, text):
        """Translate *text*, echo the translation to stdout and return it.

        Raises:
            RuntimeError: If no translator was supplied to the constructor
                and no module-level ``translator`` exists either.
        """
        active = self.translator
        if active is None:
            # Backward-compatible fallback for code that relies on a
            # module-level ``translator`` object.
            active = globals().get('translator')
        if active is None:
            raise RuntimeError(
                "No translator available: pass one to Subtitle(...)"
            )
        translated_text = active.translate(text)
        print(translated_text)
        return translated_text


def check_encoding(subtitle_file):
    """Return the encoding chardet reports for *subtitle_file*.

    The value comes straight from ``chardet.detect(...)['encoding']``;
    NOTE(review): chardet may report ``None`` when it cannot decide --
    confirm how callers want that handled.
    """
    with open(subtitle_file, 'rb') as f:
        rawdata = f.read()
    return chardet.detect(rawdata)['encoding']


def main():
    """Translate all ``.srt`` files in ``~/Documents/Subtitles``."""
    home_dir = os.path.expanduser('~')
    directory = os.path.join(home_dir, 'Documents', 'Subtitles')
    if not os.path.isdir(directory):
        sys.exit(f"Subtitle directory not found: {directory}")

    srt_files = sorted(
        f for f in os.listdir(directory) if f.lower().endswith('.srt')
    )
    if not srt_files:
        print("No translation")
        return

    # Load the model only once there is confirmed work to do.
    translator = Translator()
    for srt_file in srt_files:
        subtitle = Subtitle(directory, srt_file, translator)
        try:
            subtitle_encoding = check_encoding(subtitle.file_path)
            translated_file = subtitle.translate_subtitle(subtitle_encoding)
        except (OSError, UnicodeError, LookupError) as err:
            # One unreadable or mis-detected file must not abort the batch.
            print("No translation")
            print(f"{srt_file}: {err}", file=sys.stderr)
            continue
        print(translated_file)


if __name__ == '__main__':
    main()