import torch


class MomentumBuffer:
    """Accumulate guidance differences across sampling steps.

    Every call to :meth:`update` sets
    ``running_average = update_value + momentum * running_average``.
    """

    def __init__(self, momentum: float = -0.75):
        self.momentum = momentum
        # Scalar 0 until the first update; afterwards it holds a tensor.
        self.running_average = 0

    def update(self, update_value: torch.Tensor):
        """Fold ``update_value`` into :attr:`running_average`."""
        new_average = self.momentum * self.running_average
        self.running_average = update_value + new_average


def project(
    v0: torch.Tensor,  # [B, C, H, W]
    v1: torch.Tensor,  # [B, C, H, W]
    dims=(-1, -2),
):
    """Split ``v0`` into parts parallel and orthogonal to ``v1``.

    ``v1`` is L2-normalised over ``dims`` and ``v0`` is projected onto it.
    The arithmetic runs in float64 and both results are cast back to the
    dtype ``v0`` had on entry.

    Args:
        v0: Tensor to decompose.
        v1: Tensor giving the reference direction.
        dims: Dimensions over which the normalisation and the inner product
            are taken. Any sequence of ints is accepted.

    Returns:
        Tuple ``(v0_parallel, v0_orthogonal)``.
    """
    # Immutable default in the signature; torch still receives a list.
    dims = list(dims)
    dtype = v0.dtype
    v0, v1 = v0.double(), v1.double()
    v1 = torch.nn.functional.normalize(v1, dim=dims)
    v0_parallel = (v0 * v1).sum(dim=dims, keepdim=True) * v1
    v0_orthogonal = v0 - v0_parallel
    return v0_parallel.to(dtype), v0_orthogonal.to(dtype)


def apg_forward(
    pred_cond: torch.Tensor,  # [B, C, H, W]
    pred_uncond: torch.Tensor,  # [B, C, H, W]
    guidance_scale: float,
    momentum_buffer: MomentumBuffer = None,
    eta: float = 0.0,
    norm_threshold: float = 2.5,
    dims=(-1, -2),
):
    """Combine conditional and unconditional predictions with APG guidance.

    Steps, in order:

    1. ``diff = pred_cond - pred_uncond``; when ``momentum_buffer`` is given
       it is updated with ``diff`` and its running average replaces ``diff``.
    2. When ``norm_threshold > 0`` the L2 norm of ``diff`` over ``dims`` is
       capped at ``norm_threshold``; smaller norms are left untouched.
    3. ``diff`` is split into parts parallel and orthogonal to ``pred_cond``
       and recombined as ``orthogonal + eta * parallel``.
    4. The result is ``pred_cond + (guidance_scale - 1) * update``.

    Args:
        pred_cond: Conditional model prediction.
        pred_uncond: Unconditional model prediction.
        guidance_scale: Guidance strength; ``1`` returns ``pred_cond``.
        momentum_buffer: Optional accumulator, mutated in place.
        eta: Weight of the parallel component in the update.
        norm_threshold: Upper bound on the norm of ``diff``; ``<= 0``
            disables the cap.
        dims: Dimensions used for the norm and for the projection.

    Returns:
        The guided prediction, same shape as ``pred_cond``.
    """
    dims = list(dims)
    diff = pred_cond - pred_uncond
    if momentum_buffer is not None:
        momentum_buffer.update(diff)
        diff = momentum_buffer.running_average

    if norm_threshold > 0:
        diff_norm = diff.norm(p=2, dim=dims, keepdim=True)
        # Only ever scale down: the factor is 1 unless the norm exceeds the
        # threshold. Clamping the broadcastable ratio avoids allocating a
        # full-size tensor of ones just to take an element-wise minimum.
        scale_factor = torch.clamp(norm_threshold / diff_norm, max=1.0)
        diff = diff * scale_factor

    diff_parallel, diff_orthogonal = project(diff, pred_cond, dims)
    normalized_update = diff_orthogonal + eta * diff_parallel
    pred_guided = pred_cond + (guidance_scale - 1) * normalized_update
    return pred_guided


def cfg_forward(cond_output, uncond_output, cfg_strength):
    """Return ``uncond_output + cfg_strength * (cond_output - uncond_output)``."""
    return uncond_output + cfg_strength * (cond_output - uncond_output)
def ensure_directory_exists(directory):
    """Create ``directory`` (with missing parents) unless the path exists.

    Args:
        directory: Path-like object or string.
    """
    directory = str(directory)
    if not os.path.exists(directory):
        # exist_ok closes the window in which another process creates the
        # directory between the check above and this call.
        os.makedirs(directory, exist_ok=True)


VALID_STRUCTURE_PATTERN = ["hook", "break", "pre-chorus", "solo", "inst", "end", "outro", "bridge", "chorus", "verse", "intro", "start"]

# Compiled once at import time instead of on every call.
_BRACKETED_PREFIX = re.compile(r"\[.*\]")


def is_structure_tag(lin):
    """Tell whether a lyric line looks like a song-structure tag.

    The lower-cased line must start with a bracketed span (``[...]``) and
    contain one of the keywords in ``VALID_STRUCTURE_PATTERN`` anywhere.

    Args:
        lin: One line of lyrics.

    Returns:
        ``True`` when both conditions hold, else ``False``.
    """
    lin = lin.lower()
    # The bracket test does not depend on the keyword, so run it once.
    if not _BRACKETED_PREFIX.match(lin):
        return False
    return any(tag in lin for tag in VALID_STRUCTURE_PATTERN)


# Re-tokenisation logic: language codes accepted for lyric lines. Only key
# membership is used by the code visible in this file.
SUPPORT_LANGUAGES = {
    "en": 259, "de": 260, "fr": 262, "es": 284, "it": 285,
    "pt": 286, "pl": 294, "tr": 295, "ru": 267, "cs": 293,
    "nl": 297, "ar": 5022, "zh": 5023, "ja": 5412, "hu": 5753,
    "ko": 6152, "hi": 6680
}

structure_pattern = re.compile(r"\[.*?\]")
def set_seeds(self, batch_size, manual_seeds=None):
    """Create one seeded ``torch.Generator`` per batch item.

    Args:
        batch_size: Number of generators to create.
        manual_seeds: Optional seed specification:

            * a string of comma-separated integers assigns seeds item by
              item (blank entries are ignored);
            * a string holding a single non-negative integer, or an
              ``int``, gives every item that same seed;
            * a list or tuple of integers assigns seeds item by item;
            * anything else (including ``None``) draws a random seed for
              every item.

            When fewer seeds than ``batch_size`` are supplied, the remaining
            items get random seeds instead of raising ``IndexError``.

    Returns:
        Tuple ``(random_generators, actual_seeds)``, both of length
        ``batch_size``.

    Raises:
        ValueError: If a comma-separated entry is not an integer.
    """
    seeds = None
    if isinstance(manual_seeds, str):
        text = manual_seeds.strip()
        if "," in text:
            seeds = [int(part) for part in text.split(",") if part.strip()]
        elif text.isdigit():
            seeds = int(text)
    elif isinstance(manual_seeds, int) and not isinstance(manual_seeds, bool):
        seeds = manual_seeds
    elif isinstance(manual_seeds, (list, tuple)):
        seeds = [int(value) for value in manual_seeds]

    random_generators = []
    actual_seeds = []
    for i in range(batch_size):
        if isinstance(seeds, int):
            seed = seeds
        elif isinstance(seeds, list) and i < len(seeds):
            seed = seeds[i]
        else:
            # No usable seed for this item: draw one and report it so the
            # run can be reproduced later.
            seed = torch.randint(0, 2**32, (1,)).item()
        logger.info(f"batch idx: {i}, seed: {seed}")
        generator = torch.Generator(device=self.device)
        generator.manual_seed(seed)
        random_generators.append(generator)
        actual_seeds.append(seed)
    return random_generators, actual_seeds
@torch.no_grad()
def text2music_diffusion_process(
    self,
    duration,
    encoder_text_hidden_states,
    text_attention_mask,
    speaker_embds,
    lyric_token_ids,
    lyric_mask,
    random_generators=None,
    infer_steps=60,
    guidance_scale=15.0,
    omega_scale=10.0,
    scheduler_type="euler",
    cfg_type="apg",
):
    """Run the sampling loop and return the final latents.

    Args:
        duration: Requested length; determines the number of latent frames.
        encoder_text_hidden_states: Text conditioning; its device, dtype and
            batch size are reused for the latents.
        text_attention_mask: Mask for the text conditioning.
        speaker_embds: Speaker conditioning.
        lyric_token_ids: Lyric token ids.
        lyric_mask: Mask for the lyric tokens.
        random_generators: Generator(s) passed to ``randn_tensor``.
        infer_steps: Number of inference steps requested from the scheduler.
        guidance_scale: Guidance strength; ``0.0`` or ``1.0`` disables
            classifier-free guidance.
        omega_scale: Forwarded to ``scheduler.step`` as ``omega``.
        scheduler_type: ``"euler"`` or ``"heun"``.
        cfg_type: ``"apg"`` selects ``apg_forward``; any other value selects
            ``cfg_forward``.

    Returns:
        The latents after the last scheduler step.

    Raises:
        ValueError: If ``scheduler_type`` is not supported.
    """
    logger.info("cfg_type: {}, guidance_scale: {}, omega_scale: {}".format(cfg_type, guidance_scale, omega_scale))
    do_classifier_free_guidance = True
    if guidance_scale == 0.0 or guidance_scale == 1.0:
        do_classifier_free_guidance = False

    device = encoder_text_hidden_states.device
    dtype = encoder_text_hidden_states.dtype
    bsz = encoder_text_hidden_states.shape[0]

    if scheduler_type == "euler":
        scheduler = FlowMatchEulerDiscreteScheduler(
            num_train_timesteps=1000,
            shift=3.0,
        )
    elif scheduler_type == "heun":
        scheduler = FlowMatchHeunDiscreteScheduler(
            num_train_timesteps=1000,
            shift=3.0,
        )
    else:
        # Previously an unknown value left `scheduler` unbound and failed
        # later with an opaque UnboundLocalError.
        raise ValueError(
            f"unsupported scheduler_type: {scheduler_type!r} (expected 'euler' or 'heun')"
        )

    # NOTE(review): 44100 is used here while the decode/save helpers in this
    # class default to sample_rate=48000 -- confirm the mismatch is intended.
    # The 512 and 8 divisors are presumably the codec hop size and temporal
    # compression; TODO confirm.
    frame_length = int(duration * 44100 / 512 / 8)
    timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=infer_steps, device=device, timesteps=None)
    target_latents = randn_tensor(shape=(bsz, 8, 16, frame_length), generator=random_generators, device=device, dtype=dtype)
    attention_mask = torch.ones(bsz, frame_length, device=device, dtype=dtype)

    if do_classifier_free_guidance:
        # Conditional half first, then an unconditional half in which the
        # text, speaker and lyric inputs are zeroed.
        attention_mask = torch.cat([attention_mask] * 2, dim=0)
        encoder_text_hidden_states = torch.cat([encoder_text_hidden_states, torch.zeros_like(encoder_text_hidden_states)], 0)
        text_attention_mask = torch.cat([text_attention_mask] * 2, dim=0)

        speaker_embds = torch.cat([speaker_embds, torch.zeros_like(speaker_embds)], 0)

        lyric_token_ids = torch.cat([lyric_token_ids, torch.zeros_like(lyric_token_ids)], 0)
        lyric_mask = torch.cat([lyric_mask, torch.zeros_like(lyric_mask)], 0)

    momentum_buffer = MomentumBuffer()

    for t in tqdm(timesteps, total=num_inference_steps):
        # Duplicate the latents when doing classifier-free guidance so both
        # halves are evaluated in one forward pass.
        latents = target_latents
        latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
        timestep = t.expand(latent_model_input.shape[0])
        noise_pred = self.model.transformers(
            hidden_states=latent_model_input,
            attention_mask=attention_mask,
            encoder_text_hidden_states=encoder_text_hidden_states,
            text_attention_mask=text_attention_mask,
            speaker_embeds=speaker_embds,
            lyric_token_idx=lyric_token_ids,
            lyric_mask=lyric_mask,
            timestep=timestep,
        ).sample

        if do_classifier_free_guidance:
            noise_pred_with_cond, noise_pred_uncond = noise_pred.chunk(2)
            if cfg_type == "apg":
                noise_pred = apg_forward(
                    pred_cond=noise_pred_with_cond,
                    pred_uncond=noise_pred_uncond,
                    guidance_scale=guidance_scale,
                    momentum_buffer=momentum_buffer,
                )
            else:
                noise_pred = cfg_forward(
                    cond_output=noise_pred_with_cond,
                    uncond_output=noise_pred_uncond,
                    cfg_strength=guidance_scale,
                )

        target_latents = scheduler.step(model_output=noise_pred, timestep=t, sample=target_latents, return_dict=False, omega=omega_scale)[0]

    return target_latents
self.device) + encoder_text_hidden_states = encoder_text_hidden_states.repeat(batch_size, 1, 1) + text_attention_mask = text_attention_mask.repeat(batch_size, 1) + + speaker_embeds = torch.zeros(batch_size, 512).to(self.device).to(self.dtype) + + # 6 lyric + lyric_token_idx = torch.tensor([0]).repeat(batch_size, 1).to(self.device).long() + lyric_mask = torch.tensor([0]).repeat(batch_size, 1).to(self.device).long() + if len(lyrics) > 0: + lyric_token_idx = self.tokenize_lyrics(lyrics, debug=True) + lyric_mask = [1] * len(lyric_token_idx) + lyric_token_idx = torch.tensor(lyric_token_idx).unsqueeze(0).to(self.device).repeat(batch_size, 1) + lyric_mask = torch.tensor(lyric_mask).unsqueeze(0).to(self.device).repeat(batch_size, 1) + + if audio_duration <= 0: + audio_duration = random.uniform(30.0, 300.0) + logger.info(f"random audio duration: {audio_duration}") + + # 7. encode + target_latents = self.text2music_diffusion_process( + audio_duration, + encoder_text_hidden_states=encoder_text_hidden_states, + text_attention_mask=text_attention_mask, + speaker_embds=speaker_embeds, + lyric_token_ids=lyric_token_idx, + lyric_mask=lyric_mask, + guidance_scale=guidance_scale, + omega_scale=omega_scale, + infer_steps=infer_step, + random_generators=random_generators, + scheduler_type=scheduler_type, + cfg_type=cfg_type, + ) + + # 8 latents2audio + output_paths = self.latents2audio(latents=target_latents, target_wav_duration_second=audio_duration) + if input_params_json is None: + input_params_json = {} + input_params_json["prompt"] = prompt + input_params_json["lyrics"] = lyrics + input_params_json["infer_steps"] = infer_step + input_params_json["guidance_scale"] = guidance_scale + input_params_json["manual_seeds"] = manual_seeds + input_params_json["actual_seeds"] = actual_seeds + input_params_json["checkpoint_path"] = self.checkpoint_path + input_params_json["omega_scale"] = omega_scale + input_params_json["scheduler_type"] = scheduler_type + input_params_json["cfg_type"] = 
cfg_type + input_params_json["audio_duration"] = audio_duration + logger.info(json.dumps(input_params_json, indent=4, ensure_ascii=False)) + + return output_paths + [input_params_json] + + +def main(args): + + model_demo = InferDemo(args) + + demo = create_main_demo_ui( + checkpoint_path=args.checkpoint_path, + text2music_process_func=model_demo.process_text2music, + ) + demo.launch( + server_name="0.0.0.0", + server_port=args.port, + auth=same_auth, + share=args.share + ) + + +if __name__ == "__main__": + main(args) diff --git a/language_segmentation/LangSegment.py b/language_segmentation/LangSegment.py new file mode 100644 index 0000000..9face63 --- /dev/null +++ b/language_segmentation/LangSegment.py @@ -0,0 +1,866 @@ +""" +This file bundles language identification functions. + +Modifications (fork): Copyright (c) 2021, Adrien Barbaresi. + +Original code: Copyright (c) 2011 Marco Lui . +Based on research by Marco Lui and Tim Baldwin. + +See LICENSE file for more info. +https://github.com/adbar/py3langid + +Projects: +https://github.com/juntaosun/LangSegment +""" + +import os +import re +import sys +import numpy as np +from collections import Counter +from collections import defaultdict + +# import langid +# import py3langid as langid +# pip install py3langid==0.2.2 + +# 启用语言预测概率归一化,概率预测的分数。因此,实现重新规范化 产生 0-1 范围内的输出。 +# langid disables probability normalization by default. For command-line usages of , it can be enabled by passing the flag. +# For probability normalization in library use, the user must instantiate their own . An example of such usage is as follows: +from py3langid.langid import LanguageIdentifier, MODEL_FILE + +# Digital processing +try:from .utils.num import num2str +except ImportError: + try:from utils.num import num2str + except ImportError as e: + raise e + +# ----------------------------------- +# 更新日志:新版本分词更加精准。 +# Changelog: The new version of the word segmentation is more accurate. 
+# チェンジログ:新しいバージョンの単語セグメンテーションはより正確です。 +# Changelog: 분할이라는 단어의 새로운 버전이 더 정확합니다. +# ----------------------------------- + + +# Word segmentation function: +# automatically identify and split the words (Chinese/English/Japanese/Korean) in the article or sentence according to different languages, +# making it more suitable for TTS processing. +# This code is designed for front-end text multi-lingual mixed annotation distinction, multi-language mixed training and inference of various TTS projects. +# This processing result is mainly for (Chinese = zh, Japanese = ja, English = en, Korean = ko), and can actually support up to 97 different language mixing processing. + +#=========================================================================================================== +#分かち書き機能:文章や文章の中の例えば(中国語/英語/日本語/韓国語)を、異なる言語で自動的に認識して分割し、TTS処理により適したものにします。 +#このコードは、さまざまなTTSプロジェクトのフロントエンドテキストの多言語混合注釈区別、多言語混合トレーニング、および推論のために特別に作成されています。 +#=========================================================================================================== +#(1)自動分詞:「韓国語では何を読むのですかあなたの体育の先生は誰ですか?今回の発表会では、iPhone 15シリーズの4機種が登場しました」 +#(2)手動分詞:“あなたの名前は佐々木ですか?ですか?” +#この処理結果は主に(中国語=zh、日本語=ja、英語=en、韓国語=ko)を対象としており、実際には最大97の異なる言語の混合処理をサポートできます。 +#=========================================================================================================== + +#=========================================================================================================== +# 단어 분할 기능: 기사 또는 문장에서 단어(중국어/영어/일본어/한국어)를 다른 언어에 따라 자동으로 식별하고 분할하여 TTS 처리에 더 적합합니다. +# 이 코드는 프런트 엔드 텍스트 다국어 혼합 주석 분화, 다국어 혼합 교육 및 다양한 TTS 프로젝트의 추론을 위해 설계되었습니다. +#=========================================================================================================== +# (1) 자동 단어 분할: "한국어로 무엇을 읽습니까? 스포츠 씨? 이 컨퍼런스는 4개의 iPhone 15 시리즈 모델을 제공합니다." +# (2) 수동 참여: "이름이 Saki입니까? ?" +# 이 처리 결과는 주로 (중국어 = zh, 일본어 = ja, 영어 = en, 한국어 = ko)를 위한 것이며 실제로 혼합 처리를 위해 최대 97개의 언어를 지원합니다. 
+#=========================================================================================================== + +# =========================================================================================================== +# 分词功能:将文章或句子里的例如(中/英/日/韩),按不同语言自动识别并拆分,让它更适合TTS处理。 +# 本代码专为各种 TTS 项目的前端文本多语种混合标注区分,多语言混合训练和推理而编写。 +# =========================================================================================================== +# (1)自动分词:“韩语中的오빠读什么呢?あなたの体育の先生は誰ですか? 此次发布会带来了四款iPhone 15系列机型” +# (2)手动分词:“你的名字叫佐々木?吗?” +# 本处理结果主要针对(中文=zh , 日文=ja , 英文=en , 韩语=ko), 实际上可支持多达 97 种不同的语言混合处理。 +# =========================================================================================================== + + +# 手动分词标签规范:<语言标签>文本内容 +# 수동 단어 분할 태그 사양: <언어 태그> 텍스트 내용 +# Manual word segmentation tag specification: text content +# 手動分詞タグ仕様:<言語タグ>テキスト内容 +# =========================================================================================================== +# For manual word segmentation, labels need to appear in pairs, such as: +# 如需手动分词,标签需要成对出现,例如:“佐々木” 或者 “佐々木” +# 错误示范:“你的名字叫佐々木。” 此句子中出现的单个标签将被忽略,不会处理。 +# Error demonstration: "Your name is 佐々木。" Single tags that appear in this sentence will be ignored and will not be processed. +# =========================================================================================================== + + +# =========================================================================================================== +# 语音合成标记语言 SSML , 这里只支持它的标签(非 XML)Speech Synthesis Markup Language SSML, only its tags are supported here (not XML) +# 想支持更多的 SSML 标签?欢迎 PR! Want to support more SSML tags? PRs are welcome! +# 说明:除了中文以外,它也可改造成支持多语种 SSML ,不仅仅是中文。 +# Note: In addition to Chinese, it can also be modified to support multi-language SSML, not just Chinese. 
class LangSSML:
    """Chinese readings for the SSML-style tags supported here.

    Only the tags are supported, not XML. One public method per tag:

    * ``number``    -> :meth:`to_chinese_number`    (digit-by-digit reading)
    * ``telephone`` -> :meth:`to_chinese_telephone` (phone-number reading)
    * ``currency``  -> :meth:`to_chinese_currency`  (amount reading)
    * ``date``      -> :meth:`to_chinese_date`      (date/time reading)
    """

    def __init__(self):
        # Plain digits, read one character at a time.
        self._zh_numerals_number = {
            '0': '零',
            '1': '一',
            '2': '二',
            '3': '三',
            '4': '四',
            '5': '五',
            '6': '六',
            '7': '七',
            '8': '八',
            '9': '九'
        }

    def _format_chinese_data(self, date_str: str):
        """Normalise a date and/or time string to a Chinese reading.

        Accepts inputs such as ``2024/8/24``, ``2024-08``, ``08-24`` and
        ``24``, optionally followed by a space and a time, or a time alone
        (recognised by the presence of ``:``).

        Args:
            date_str: Raw date/time text.

        Returns:
            The year-month-day reading followed by the
            hours-minutes-seconds reading; ``""`` for ``None`` or blank
            input.
        """
        if date_str is None or date_str.strip() == "":
            return ""
        # Map every supported date separator to "-" and drop the day suffix.
        date_str = re.sub(r"[\/\._|年|月]", "-", date_str)
        date_str = re.sub(r"日", r"", date_str)
        date_arrs = date_str.split(' ')
        if len(date_arrs) == 1 and ":" in date_arrs[0]:
            # A lone token containing ":" is a time with no date part.
            time_str = date_arrs[0]
            date_arrs = []
        else:
            time_str = date_arrs[1] if len(date_arrs) >= 2 else ""

        def with_unit(num, cn, func=None):
            # Convert `num` with `func`, then append the unit `cn`; empty
            # and "0" results produce nothing at all.
            if func is not None:
                num = func(num)
            return f"{num}{cn}" if num is not None and num != "" and num != "0" else ""

        f_number = self.to_chinese_number
        f_currency = self.to_chinese_currency

        # year, month, day
        year_month_day = ""
        if len(date_arrs) > 0:
            year, month, day = "", "", ""
            parts = date_arrs[0].split('-')
            if len(parts) == 3:  # YYYY-MM-DD
                year, month, day = parts
            elif len(parts) == 2:  # MM-DD or YYYY-MM
                if len(parts[0]) == 4:  # year-month
                    year, month = parts
                else:  # month-day
                    month, day = parts
            elif len(parts[0]) > 0:  # a lone year or a lone day
                if len(parts[0]) == 4:
                    year = parts[0]
                else:
                    day = parts[0]
            year = with_unit(year, "年", f_number)
            month = with_unit(month, "月", f_currency)
            day = with_unit(day, "日", f_currency)
            # Collapse runs of unit characters left by empty components.
            year_month_day = re.sub(r"([年|月|日])+", r"\1", f"{year}{month}{day}")

        # hours, minutes, seconds
        time_str = re.sub(r"[\/\.\-:_]", ":", time_str)
        time_arrs = time_str.split(":")
        hours, minutes, seconds = "", "", ""
        if len(time_arrs) == 3:  # H/M/S
            hours, minutes, seconds = time_arrs
        elif len(time_arrs) == 2:  # H/M
            hours, minutes = time_arrs
        elif len(time_arrs[0]) > 0:  # H only: kept as written, plus the unit
            hours = f'{time_arrs[0]}点'
        if len(time_arrs) > 1:
            hours = with_unit(hours, "点", f_currency)
            minutes = with_unit(minutes, "分", f_currency)
            seconds = with_unit(seconds, "秒", f_currency)
        hours_minutes_seconds = re.sub(r"([点|分|秒])+", r"\1", f"{hours}{minutes}{seconds}")
        return f"{year_month_day}{hours_minutes_seconds}"

    def to_chinese_number(self, num: str):
        """[SSML number] Read every digit individually as a Chinese numeral.

        Non-digit text is kept, except that ``.`` becomes ``点``. Digit
        characters missing from the numeral table are dropped.
        """
        zh_numerals = self._zh_numerals_number

        def spell_digits(match):
            return ''.join(zh_numerals.get(digit, "") for digit in match.group(0))

        return re.sub(r'\d+', spell_digits, num).replace(".", "点")

    def to_chinese_telephone(self, num: str):
        """[SSML telephone] Read digits as a Chinese phone number.

        The ``+86`` prefix is removed and ``一`` is replaced by ``幺``.
        """
        output = self.to_chinese_number(num.replace("+86", ""))  # zh +86
        return output.replace("一", "幺")

    def to_chinese_currency(self, num: str):
        """[SSML currency] Read each digit run as an amount via ``num2str``.

        Non-digit text is kept, except that ``.`` becomes ``点``.
        Digit handling comes from GPT_SoVITS ``num.py``.
        """
        return re.sub(r'\d+', lambda match: num2str(match.group(0)), num).replace(".", "点")

    def to_chinese_date(self, num: str):
        """[SSML date] Read a date; see :meth:`_format_chinese_data`.

        Supports inputs such as ``2024年08月24``, ``2024/8/24``, ``2024-08``,
        ``08-24`` and ``24``.
        """
        return self._format_chinese_data(num)
self._text_cache = None + self._text_lasts = None + self._text_langs = None + self._lang_count = None + self._lang_eos = None + + # 可自定义语言匹配标签:カスタマイズ可能な言語対応タグ:사용자 지정 가능한 언어 일치 태그: + # Customizable language matching tags: These are supported,이 표현들은 모두 지지합니다 + # 你好 , 佐々木 , OK , 오빠 这些写法均支持 + self.SYMBOLS_PATTERN = r'(<([a-zA-Z|-]*)>(.*?)<\/*[a-zA-Z|-]*>)' + + # 语言过滤组功能, 可以指定保留语言。不在过滤组中的语言将被清除。您可随心搭配TTS语音合成所支持的语言。 + # 언어 필터 그룹 기능을 사용하면 예약된 언어를 지정할 수 있습니다. 필터 그룹에 없는 언어는 지워집니다. TTS 텍스트에서 지원하는 언어를 원하는 대로 일치시킬 수 있습니다. + # 言語フィルターグループ機能では、予約言語を指定できます。フィルターグループに含まれていない言語はクリアされます。TTS音声合成がサポートする言語を自由に組み合わせることができます。 + # The language filter group function allows you to specify reserved languages. + # Languages not in the filter group will be cleared. You can match the languages supported by TTS Text To Speech as you like. + # 排名越前,优先级越高,The higher the ranking, the higher the priority,ランキングが上位になるほど、優先度が高くなります。 + + # 系统默认过滤器。System default filter。(ISO 639-1 codes given) + # ---------------------------------------------------------------------------------------------------------------------------------- + # "zh"中文=Chinese ,"en"英语=English ,"ja"日语=Japanese ,"ko"韩语=Korean ,"fr"法语=French ,"vi"越南语=Vietnamese , "ru"俄语=Russian + # "th"泰语=Thai + # ---------------------------------------------------------------------------------------------------------------------------------- + self.DEFAULT_FILTERS = ["zh", "ja", "ko", "en"] + + # 用户可自定义过滤器。User-defined filters + self.Langfilters = self.DEFAULT_FILTERS[:] # 创建副本 + + # 合并文本 + self.isLangMerge = True + + # 试验性支持:您可自定义添加:"fr"法语 , "vi"越南语。Experimental: You can customize to add: "fr" French, "vi" Vietnamese. 
+ # 请使用API启用:self.setfilters(["zh", "en", "ja", "ko", "fr", "vi" , "ru" , "th"]) # 您可自定义添加,如:"fr"法语 , "vi"越南语。 + + # 预览版功能,自动启用或禁用,无需设置 + # Preview feature, automatically enabled or disabled, no settings required + self.EnablePreview = False + + # 除此以外,它支持简写过滤器,只需按不同语种任意组合即可。 + # In addition to that, it supports abbreviation filters, allowing for any combination of different languages. + # 示例:您可以任意指定多种组合,进行过滤 + # Example: You can specify any combination to filter + + # 中/日语言优先级阀值(评分范围为 0 ~ 1):评分低于设定阀值 <0.89 时,启用 filters 中的优先级。\n + # 중/일본어 우선 순위 임계값(점수 범위 0-1): 점수가 설정된 임계값 <0.89보다 낮을 때 필터에서 우선 순위를 활성화합니다. + # 中国語/日本語の優先度しきい値(スコア範囲0〜1):スコアが設定されたしきい値<0.89未満の場合、フィルターの優先度が有効になります。\n + # Chinese and Japanese language priority threshold (score range is 0 ~ 1): The default threshold is 0.89. \n + # Only the common characters between Chinese and Japanese are processed with confidence and priority. \n + self.LangPriorityThreshold = 0.89 + + # Langfilters = ["zh"] # 按中文识别 + # Langfilters = ["en"] # 按英文识别 + # Langfilters = ["ja"] # 按日文识别 + # Langfilters = ["ko"] # 按韩文识别 + # Langfilters = ["zh_ja"] # 中日混合识别 + # Langfilters = ["zh_en"] # 中英混合识别 + # Langfilters = ["ja_en"] # 日英混合识别 + # Langfilters = ["zh_ko"] # 中韩混合识别 + # Langfilters = ["ja_ko"] # 日韩混合识别 + # Langfilters = ["en_ko"] # 英韩混合识别 + # Langfilters = ["zh_ja_en"] # 中日英混合识别 + # Langfilters = ["zh_ja_en_ko"] # 中日英韩混合识别 + + # 更多过滤组合,请您随意。。。For more filter combinations, please feel free to...... + # より多くのフィルターの組み合わせ、お気軽に。。。더 많은 필터 조합을 원하시면 자유롭게 해주세요. ..... 
+ + # 可选保留:支持中文数字拼音格式,更方便前端实现拼音音素修改和推理,默认关闭 False 。 + # 开启后 True ,括号内的数字拼音格式均保留,并识别输出为:"zh"中文。 + self.keepPinyin = False + + # DEFINITION + self.PARSE_TAG = re.compile(r'(⑥\$*\d+[\d]{6,}⑥)') + + self.LangSSML = LangSSML() + + def _clears(self): + self._text_cache = None + self._text_lasts = None + self._text_langs = None + self._text_waits = None + self._lang_count = None + self._lang_eos = None + + def _is_english_word(self, word): + return bool(re.match(r'^[a-zA-Z]+$', word)) + + def _is_chinese(self, word): + for char in word: + if '\u4e00' <= char <= '\u9fff': + return True + return False + + def _is_japanese_kana(self, word): + pattern = re.compile(r'[\u3040-\u309F\u30A0-\u30FF]+') + matches = pattern.findall(word) + return len(matches) > 0 + + def _insert_english_uppercase(self, word): + modified_text = re.sub(r'(? 0 else None + if symbol is not None:pass + elif preData is not None and preData["symbol"] is None: + if len(clear_text) == 0:language = preData["lang"] + elif is_number == True:language = preData["lang"] + _ , pre_is_number = self._clear_text_number(preData["text"]) + if (preData["lang"] == language): + self._statistics(preData["lang"],text) + text = preData["text"] + text + preData["text"] = text + return preData + elif pre_is_number == True: + text = f'{preData["text"]}{text}' + words.pop() + elif is_number == True: + priority_language = self._get_filters_string()[:2] + if priority_language in "ja-zh-en-ko-fr-vi":language = priority_language + data = {"lang":language,"text": text,"score":score,"symbol":symbol} + filters = self.Langfilters + if filters is None or len(filters) == 0 or "?" 
in language or \ + language in filters or language in filters[0] or \ + filters[0] == "*" or filters[0] in "alls-mixs-autos": + words.append(data) + self._statistics(data["lang"],data["text"]) + return data + + def _addwords(self, words,language,text,score,symbol=None): + if text == "\n":pass # Keep Line Breaks + elif text is None or len(text.strip()) == 0:return True + if language is None:language = "" + language = language.lower() + if language == 'en':text = self._insert_english_uppercase(text) + # text = re.sub(r'[(())]', ',' , text) # Keep it. + text_waits = self._text_waits + ispre_waits = len(text_waits)>0 + preResult = text_waits.pop() if ispre_waits else None + if preResult is None:preResult = words[-1] if len(words) > 0 else None + if preResult and ("|" in preResult["lang"]): + pre_lang = preResult["lang"] + if language in pre_lang:preResult["lang"] = language = language.split("|")[0] + else:preResult["lang"]=pre_lang.split("|")[0] + if ispre_waits:preResult = self._saveData(words,preResult["lang"],preResult["text"],preResult["score"],preResult["symbol"]) + pre_lang = preResult["lang"] if preResult else None + if ("|" in language) and (pre_lang and not pre_lang in language and not "…" in language):language = language.split("|")[0] + if "|" in language:self._text_waits.append({"lang":language,"text": text,"score":score,"symbol":symbol}) + else:self._saveData(words,language,text,score,symbol) + return False + + def _get_prev_data(self, words): + data = words[-1] if words and len(words) > 0 else None + if data:return (data["lang"] , data["text"]) + return (None,"") + + def _match_ending(self, input , index): + if input is None or len(input) == 0:return False,None + input = re.sub(r'\s+', '', input) + if len(input) == 0 or abs(index) > len(input):return False,None + ending_pattern = re.compile(r'([「」“”‘’"\'::。.!!?.?])') + return ending_pattern.match(input[index]),input[index] + + def _cleans_text(self, cleans_text): + cleans_text = re.sub(r'(.*?)([^\w]+)', 
r'\1 ', cleans_text) + cleans_text = re.sub(r'(.)\1+', r'\1', cleans_text) + return cleans_text.strip() + + def _mean_processing(self, text:str): + if text is None or (text.strip()) == "":return None , 0.0 + arrs = self._split_camel_case(text).split(" ") + langs = [] + for t in arrs: + if len(t.strip()) <= 3:continue + language, score = self.langid.classify(t) + langs.append({"lang":language}) + if len(langs) == 0:return None , 0.0 + return Counter([item['lang'] for item in langs]).most_common(1)[0][0],1.0 + + def _lang_classify(self, cleans_text): + language, score = self.langid.classify(cleans_text) + # fix: Huggingface is np.float32 + if score is not None and isinstance(score, np.generic) and hasattr(score,"item"): + score = score.item() + score = round(score , 3) + return language, score + + def _get_filters_string(self): + filters = self.Langfilters + return "-".join(filters).lower().strip() if filters is not None else "" + + def _parse_language(self, words , segment): + LANG_JA = "ja" + LANG_ZH = "zh" + LANG_ZH_JA = f'{LANG_ZH}|{LANG_JA}' + LANG_JA_ZH = f'{LANG_JA}|{LANG_ZH}' + language = LANG_ZH + regex_pattern = re.compile(r'([^\w\s]+)') + lines = regex_pattern.split(segment) + lines_max = len(lines) + LANG_EOS =self._lang_eos + for index, text in enumerate(lines): + if len(text) == 0:continue + EOS = index >= (lines_max - 1) + nextId = index + 1 + nextText = lines[nextId] if not EOS else "" + nextPunc = len(re.sub(regex_pattern,'',re.sub(r'\n+','',nextText)).strip()) == 0 + textPunc = len(re.sub(regex_pattern,'',re.sub(r'\n+','',text)).strip()) == 0 + if not EOS and (textPunc == True or ( len(nextText.strip()) >= 0 and nextPunc == True)): + lines[nextId] = f'{text}{nextText}' + continue + number_tags = re.compile(r'(⑥\d{6,}⑥)') + cleans_text = re.sub(number_tags, '' ,text) + cleans_text = re.sub(r'\d+', '' ,cleans_text) + cleans_text = self._cleans_text(cleans_text) + # fix:Langid's recognition of short sentences is inaccurate, and it is spliced longer. 
+ if not EOS and len(cleans_text) <= 2: + lines[nextId] = f'{text}{nextText}' + continue + language,score = self._lang_classify(cleans_text) + prev_language , prev_text = self._get_prev_data(words) + if language != LANG_ZH and all('\u4e00' <= c <= '\u9fff' for c in re.sub(r'\s','',cleans_text)):language,score = LANG_ZH,1 + if len(cleans_text) <= 5 and self._is_chinese(cleans_text): + filters_string = self._get_filters_string() + if score < self.LangPriorityThreshold and len(filters_string) > 0: + index_ja , index_zh = filters_string.find(LANG_JA) , filters_string.find(LANG_ZH) + if index_ja != -1 and index_ja < index_zh:language = LANG_JA + elif index_zh != -1 and index_zh < index_ja:language = LANG_ZH + if self._is_japanese_kana(cleans_text):language = LANG_JA + elif len(cleans_text) > 2 and score > 0.90:pass + elif EOS and LANG_EOS:language = LANG_ZH if len(cleans_text) <= 1 else language + else: + LANG_UNKNOWN = LANG_ZH_JA if language == LANG_ZH or (len(cleans_text) <=2 and prev_language == LANG_ZH) else LANG_JA_ZH + match_end,match_char = self._match_ending(text, -1) + referen = prev_language in LANG_UNKNOWN or LANG_UNKNOWN in prev_language if prev_language else False + if match_char in "。.": language = prev_language if referen and len(words) > 0 else language + else:language = f"{LANG_UNKNOWN}|…" + text,*_ = re.subn(number_tags , self._restore_number , text ) + self._addwords(words,language,text,score) + + # ---------------------------------------------------------- + # 【SSML】中文数字处理:Chinese Number Processing (SSML support) + # 这里默认都是中文,用于处理 SSML 中文标签。当然可以支持任意语言,例如: + # The default here is Chinese, which is used to process SSML Chinese tags. 
Of course, any language can be supported, for example: + # 中文电话号码:1234567 + # 中文数字号码:1234567 + def _process_symbol_SSML(self, words,data): + tag , match = data + language = SSML = match[1] + text = match[2] + score = 1.0 + if SSML == "telephone": + # 中文-电话号码 + language = "zh" + text = self.LangSSML.to_chinese_telephone(text) + elif SSML == "number": + # 中文-数字读法 + language = "zh" + text = self.LangSSML.to_chinese_number(text) + elif SSML == "currency": + # 中文-按金额发音 + language = "zh" + text = self.LangSSML.to_chinese_currency(text) + elif SSML == "date": + # 中文-按金额发音 + language = "zh" + text = self.LangSSML.to_chinese_date(text) + self._addwords(words,language,text,score,SSML) + + # ---------------------------------------------------------- + def _restore_number(self, matche): + value = matche.group(0) + text_cache = self._text_cache + if value in text_cache: + process , data = text_cache[value] + tag , match = data + value = match + return value + + def _pattern_symbols(self, item , text): + if text is None:return text + tag , pattern , process = item + matches = pattern.findall(text) + if len(matches) == 1 and "".join(matches[0]) == text: + return text + for i , match in enumerate(matches): + key = f"⑥{tag}{i:06d}⑥" + text = re.sub(pattern , key , text , count=1) + self._text_cache[key] = (process , (tag , match)) + return text + + def _process_symbol(self, words,data): + tag , match = data + language = match[1] + text = match[2] + score = 1.0 + filters = self._get_filters_string() + if language not in filters: + self._process_symbol_SSML(words,data) + else: + self._addwords(words,language,text,score,True) + + def _process_english(self, words,data): + tag , match = data + text = match[0] + filters = self._get_filters_string() + priority_language = filters[:2] + # Preview feature, other language segmentation processing + enablePreview = self.EnablePreview + if enablePreview == True: + # Experimental: Other language support + regex_pattern = 
re.compile(r'(.*?[。.??!!]+[\n]{,1})') + lines = regex_pattern.split(text) + for index , text in enumerate(lines): + if len(text.strip()) == 0:continue + cleans_text = self._cleans_text(text) + language,score = self._lang_classify(cleans_text) + if language not in filters: + language,score = self._mean_processing(cleans_text) + if language is None or score <= 0.0:continue + elif language in filters:pass # pass + elif score >= 0.95:continue # High score, but not in the filter, excluded. + elif score <= 0.15 and filters[:2] == "fr":language = priority_language + else:language = "en" + self._addwords(words,language,text,score) + else: + # Default is English + language, score = "en", 1.0 + self._addwords(words,language,text,score) + + def _process_Russian(self, words,data): + tag , match = data + text = match[0] + language = "ru" + score = 1.0 + self._addwords(words,language,text,score) + + def _process_Thai(self, words,data): + tag , match = data + text = match[0] + language = "th" + score = 1.0 + self._addwords(words,language,text,score) + + def _process_korean(self, words,data): + tag , match = data + text = match[0] + language = "ko" + score = 1.0 + self._addwords(words,language,text,score) + + def _process_quotes(self, words,data): + tag , match = data + text = "".join(match) + childs = self.PARSE_TAG.findall(text) + if len(childs) > 0: + self._process_tags(words , text , False) + else: + cleans_text = self._cleans_text(match[1]) + if len(cleans_text) <= 5: + self._parse_language(words,text) + else: + language,score = self._lang_classify(cleans_text) + self._addwords(words,language,text,score) + + def _process_pinyin(self, words,data): + tag , match = data + text = match + language = "zh" + score = 1.0 + self._addwords(words,language,text,score) + + def _process_number(self, words,data): # "$0" process only + """ + Numbers alone cannot accurately identify language. + Because numbers are universal in all languages. + So it won't be executed here, just for testing. 
+ """ + tag , match = data + language = words[0]["lang"] if len(words) > 0 else "zh" + text = match + score = 0.0 + self._addwords(words,language,text,score) + + def _process_tags(self, words , text , root_tag): + text_cache = self._text_cache + segments = re.split(self.PARSE_TAG, text) + segments_len = len(segments) - 1 + for index , text in enumerate(segments): + if root_tag:self._lang_eos = index >= segments_len + if self.PARSE_TAG.match(text): + process , data = text_cache[text] + if process:process(words , data) + else: + self._parse_language(words , text) + return words + + def _merge_results(self, words): + new_word = [] + for index , cur_data in enumerate(words): + if "symbol" in cur_data:del cur_data["symbol"] + if index == 0:new_word.append(cur_data) + else: + pre_data = new_word[-1] + if cur_data["lang"] == pre_data["lang"]: + pre_data["text"] = f'{pre_data["text"]}{cur_data["text"]}' + else:new_word.append(cur_data) + return new_word + + def _parse_symbols(self, text): + TAG_NUM = "00" # "00" => default channels , "$0" => testing channel + TAG_S1,TAG_S2,TAG_P1,TAG_P2,TAG_EN,TAG_KO,TAG_RU,TAG_TH = "$1" ,"$2" ,"$3" ,"$4" ,"$5" ,"$6" ,"$7","$8" + TAG_BASE = re.compile(fr'(([【《((“‘"\']*[LANGUAGE]+[\W\s]*)+)') + # Get custom language filter + filters = self.Langfilters + filters = filters if filters is not None else "" + # ======================================================================================================= + # Experimental: Other language support.Thử nghiệm: Hỗ trợ ngôn ngữ khác.Expérimental : prise en charge d’autres langues. + # 相关语言字符如有缺失,熟悉相关语言的朋友,可以提交把缺失的发音符号补全。 + # If relevant language characters are missing, friends who are familiar with the relevant languages can submit a submission to complete the missing pronunciation symbols. + # S'il manque des caractères linguistiques pertinents, les amis qui connaissent les langues concernées peuvent soumettre une soumission pour compléter les symboles de prononciation manquants. 
+ # Nếu thiếu ký tự ngôn ngữ liên quan, những người bạn quen thuộc với ngôn ngữ liên quan có thể gửi bài để hoàn thành các ký hiệu phát âm còn thiếu. + # ------------------------------------------------------------------------------------------------------- + # Preview feature, other language support + enablePreview = self.EnablePreview + if "fr" in filters or \ + "vi" in filters:enablePreview = True + self.EnablePreview = enablePreview + # 实验性:法语字符支持。Prise en charge des caractères français + RE_FR = "" if not enablePreview else "àáâãäåæçèéêëìíîïðñòóôõöùúûüýþÿ" + # 实验性:越南语字符支持。Hỗ trợ ký tự tiếng Việt + RE_VI = "" if not enablePreview else "đơưăáàảãạắằẳẵặấầẩẫậéèẻẽẹếềểễệíìỉĩịóòỏõọốồổỗộớờởỡợúùủũụứừửữựôâêơưỷỹ" + # ------------------------------------------------------------------------------------------------------- + # Basic options: + process_list = [ + ( TAG_S1 , re.compile(self.SYMBOLS_PATTERN) , self._process_symbol ), # Symbol Tag + ( TAG_KO , re.compile(re.sub(r'LANGUAGE',f'\uac00-\ud7a3',TAG_BASE.pattern)) , self._process_korean ), # Korean words + ( TAG_TH , re.compile(re.sub(r'LANGUAGE',f'\u0E00-\u0E7F',TAG_BASE.pattern)) , self._process_Thai ), # Thai words support. + ( TAG_RU , re.compile(re.sub(r'LANGUAGE',f'А-Яа-яЁё',TAG_BASE.pattern)) , self._process_Russian ), # Russian words support. + ( TAG_NUM , re.compile(r'(\W*\d+\W+\d*\W*\d*)') , self._process_number ), # Number words, Universal in all languages, Ignore it. + ( TAG_EN , re.compile(re.sub(r'LANGUAGE',f'a-zA-Z{RE_FR}{RE_VI}',TAG_BASE.pattern)) , self._process_english ), # English words + Other language support. + ( TAG_P1 , re.compile(r'(["\'])(.*?)(\1)') , self._process_quotes ), # Regular quotes + ( TAG_P2 , re.compile(r'([\n]*[【《((“‘])([^【《((“‘’”))》】]{3,})([’”))》】][\W\s]*[\n]{,1})') , self._process_quotes ), # Special quotes, There are left and right. 
+ ] + # Extended options: Default False + if self.keepPinyin == True:process_list.insert(1 , + ( TAG_S2 , re.compile(r'([\(({](?:\s*\w*\d\w*\s*)+[})\)])') , self._process_pinyin ), # Chinese Pinyin Tag. + ) + # ------------------------------------------------------------------------------------------------------- + words = [] + lines = re.findall(r'.*\n*', re.sub(self.PARSE_TAG, '' ,text)) + for index , text in enumerate(lines): + if len(text.strip()) == 0:continue + self._lang_eos = False + self._text_cache = {} + for item in process_list: + text = self._pattern_symbols(item , text) + cur_word = self._process_tags([] , text , True) + if len(cur_word) == 0:continue + cur_data = cur_word[0] if len(cur_word) > 0 else None + pre_data = words[-1] if len(words) > 0 else None + if cur_data and pre_data and cur_data["lang"] == pre_data["lang"] \ + and cur_data["symbol"] == False and pre_data["symbol"] : + cur_data["text"] = f'{pre_data["text"]}{cur_data["text"]}' + words.pop() + words += cur_word + if self.isLangMerge == True:words = self._merge_results(words) + lang_count = self._lang_count + if lang_count and len(lang_count) > 0: + lang_count = dict(sorted(lang_count.items(), key=lambda x: x[1], reverse=True)) + lang_count = list(lang_count.items()) + self._lang_count = lang_count + return words + + def setfilters(self, filters): + # 当过滤器更改时,清除缓存 + # 필터가 변경되면 캐시를 지웁니다. 
+ # フィルタが変更されると、キャッシュがクリアされます + # When the filter changes, clear the cache + if self.Langfilters != filters: + self._clears() + self.Langfilters = filters + + def getfilters(self): + return self.Langfilters + + def setPriorityThreshold(self, threshold:float): + self.LangPriorityThreshold = threshold + + def getPriorityThreshold(self): + return self.LangPriorityThreshold + + def getCounts(self): + lang_count = self._lang_count + if lang_count is not None:return lang_count + text_langs = self._text_langs + if text_langs is None or len(text_langs) == 0:return [("zh",0)] + lang_counts = defaultdict(int) + for d in text_langs:lang_counts[d['lang']] += int(len(d['text'])*2) if d['lang'] == "zh" else len(d['text']) + lang_counts = dict(sorted(lang_counts.items(), key=lambda x: x[1], reverse=True)) + lang_counts = list(lang_counts.items()) + self._lang_count = lang_counts + return lang_counts + + def getTexts(self, text:str): + if text is None or len(text.strip()) == 0: + self._clears() + return [] + # lasts + text_langs = self._text_langs + if self._text_lasts == text and text_langs is not None:return text_langs + # parse + self._text_waits = [] + self._lang_count = None + self._text_lasts = text + text = self._parse_symbols(text) + self._text_langs = text + return text + + def classify(self, text:str): + return self.getTexts(text) + +def printList(langlist): + """ + 功能:打印数组结果 + 기능: 어레이 결과 인쇄 + 機能:配列結果を印刷 + Function: Print array results + """ + print("\n===================【打印结果】===================") + if langlist is None or len(langlist) == 0: + print("无内容结果,No content result") + return + for line in langlist: + print(line) + pass + + + +def main(): + + # ----------------------------------- + # 更新日志:新版本分词更加精准。 + # Changelog: The new version of the word segmentation is more accurate. + # チェンジログ:新しいバージョンの単語セグメンテーションはより正確です。 + # Changelog: 분할이라는 단어의 새로운 버전이 더 정확합니다. 
+ # ----------------------------------- + + # 输入示例1:(包含日文,中文)Input Example 1: (including Japanese, Chinese) + # text = "“昨日は雨が降った,音楽、映画。。。”你今天学习日语了吗?春は桜の季節です。语种分词是语音合成必不可少的环节。言語分詞は音声合成に欠かせない環節である!" + + # 输入示例2:(包含日文,中文)Input Example 1: (including Japanese, Chinese) + # text = "欢迎来玩。東京,は日本の首都です。欢迎来玩. 太好了!" + + # 输入示例3:(包含日文,中文)Input Example 1: (including Japanese, Chinese) + # text = "明日、私たちは海辺にバカンスに行きます。你会说日语吗:“中国語、話せますか” 你的日语真好啊!" + + + # 输入示例4:(包含日文,中文,韩语,英文)Input Example 4: (including Japanese, Chinese, Korean, English) + # text = "你的名字叫佐々木?吗?韩语中的안녕 오빠读什么呢?あなたの体育の先生は誰ですか? 此次发布会带来了四款iPhone 15系列机型和三款Apple Watch等一系列新品,这次的iPad Air采用了LCD屏幕" + + + # 试验性支持:"fr"法语 , "vi"越南语 , "ru"俄语 , "th"泰语。Experimental: Other language support. + langsegment = LangSegment() + langsegment.setfilters(["fr", "vi" , "ja", "zh", "ko", "en" , "ru" , "th"]) + text = """ +我喜欢在雨天里听音乐。 +I enjoy listening to music on rainy days. +雨の日に音楽を聴くのが好きです。 +비 오는 날에 음악을 듣는 것을 즐깁니다。 +J'aime écouter de la musique les jours de pluie. +Tôi thích nghe nhạc vào những ngày mưa. +Мне нравится слушать музыку в дождливую погоду. 
+ฉันชอบฟังเพลงในวันที่ฝนตก +""" + + + + # 进行分词:(接入TTS项目仅需一行代码调用)Segmentation: (Only one line of code is required to access the TTS project) + langlist = langsegment.getTexts(text) + printList(langlist) + + + # 语种统计:Language statistics: + print("\n===================【语种统计】===================") + # 获取所有语种数组结果,根据内容字数降序排列 + # Get the array results in all languages, sorted in descending order according to the number of content words + langCounts = langsegment.getCounts() + print(langCounts , "\n") + + # 根据结果获取内容的主要语种 (语言,字数含标点) + # Get the main language of content based on the results (language, word count including punctuation) + lang , count = langCounts[0] + print(f"输入内容的主要语言为 = {lang} ,字数 = {count}") + print("==================================================\n") + + + # 分词输出:lang=语言,text=内容。Word output: lang = language, text = content + # ===================【打印结果】=================== + # {'lang': 'zh', 'text': '你的名字叫'} + # {'lang': 'ja', 'text': '佐々木?'} + # {'lang': 'zh', 'text': '吗?韩语中的'} + # {'lang': 'ko', 'text': '안녕 오빠'} + # {'lang': 'zh', 'text': '读什么呢?'} + # {'lang': 'ja', 'text': 'あなたの体育の先生は誰ですか?'} + # {'lang': 'zh', 'text': ' 此次发布会带来了四款'} + # {'lang': 'en', 'text': 'i Phone '} + # {'lang': 'zh', 'text': '15系列机型和三款'} + # {'lang': 'en', 'text': 'Apple Watch '} + # {'lang': 'zh', 'text': '等一系列新品,这次的'} + # {'lang': 'en', 'text': 'i Pad Air '} + # {'lang': 'zh', 'text': '采用了'} + # {'lang': 'en', 'text': 'L C D '} + # {'lang': 'zh', 'text': '屏幕'} + # ===================【语种统计】=================== + + # ===================【语种统计】=================== + # [('zh', 51), ('ja', 19), ('en', 18), ('ko', 5)] + + # 输入内容的主要语言为 = zh ,字数 = 51 + # ================================================== + # The main language of the input content is = zh, word count = 51 + + +if __name__ == "__main__": + main() diff --git a/language_segmentation/__init__.py b/language_segmentation/__init__.py new file mode 100644 index 0000000..75b3bf6 --- /dev/null +++ b/language_segmentation/__init__.py 
@@ -0,0 +1,9 @@ +from .LangSegment import LangSegment + + +# release +__version__ = '0.3.5' + + +# develop +__develop__ = 'dev-0.0.1' \ No newline at end of file diff --git a/language_segmentation/utils/__init__.py b/language_segmentation/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/language_segmentation/utils/num.py b/language_segmentation/utils/num.py new file mode 100644 index 0000000..05a5f70 --- /dev/null +++ b/language_segmentation/utils/num.py @@ -0,0 +1,327 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Digital processing from GPT_SoVITS num.py (thanks) +""" +Rules to verbalize numbers into Chinese characters. 
+https://zh.wikipedia.org/wiki/中文数字#現代中文 +""" + +import re +from collections import OrderedDict +from typing import List + +DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')} +UNITS = OrderedDict({ + 1: '十', + 2: '百', + 3: '千', + 4: '万', + 8: '亿', +}) + +COM_QUANTIFIERS = '(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)' + +# 分数表达式 +RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') + + +def replace_frac(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + sign = match.group(1) + nominator = match.group(2) + denominator = match.group(3) + sign: str = "负" if sign else "" + nominator: str = num2str(nominator) + denominator: str = num2str(denominator) + result = f"{sign}{denominator}分之{nominator}" + return result + + +# 百分数表达式 +RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%') + + +def replace_percentage(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + sign = match.group(1) + percent = match.group(2) + sign: str = "负" if sign else "" + percent: str = num2str(percent) + result = f"{sign}百分之{percent}" + return result + + +# 整数表达式 +# 带负号的整数 -10 +RE_INTEGER = re.compile(r'(-)' r'(\d+)') + + +def replace_negative_num(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + sign = match.group(1) + number = match.group(2) + sign: str = "负" if sign else "" + number: str = num2str(number) + result = f"{sign}{number}" + return result + + +# 编号-无符号整形 +# 00078 +RE_DEFAULT_NUM = re.compile(r'\d{3}\d*') + + +def replace_default_num(match): + """ + Args: + match 
(re.Match) + Returns: + str + """ + number = match.group(0) + return verbalize_digit(number, alt_one=True) + + +# 加减乘除 +# RE_ASMD = re.compile( +# r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))') +RE_ASMD = re.compile( + r'((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))') + +asmd_map = { + '+': '加', + '-': '减', + '×': '乘', + '÷': '除', + '=': '等于' +} + +def replace_asmd(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + result = match.group(1) + asmd_map[match.group(8)] + match.group(9) + return result + + +# 次方专项 +RE_POWER = re.compile(r'[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+') + +power_map = { + '⁰': '0', + '¹': '1', + '²': '2', + '³': '3', + '⁴': '4', + '⁵': '5', + '⁶': '6', + '⁷': '7', + '⁸': '8', + '⁹': '9', + 'ˣ': 'x', + 'ʸ': 'y', + 'ⁿ': 'n' +} + +def replace_power(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + power_num = "" + for m in match.group(0): + power_num += power_map[m] + result = "的" + power_num + "次方" + return result + + +# 数字表达式 +# 纯小数 +RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))') +# 正整数 + 量词 +RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" 
+ COM_QUANTIFIERS) +RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))') + + +def replace_positive_quantifier(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + number = match.group(1) + match_2 = match.group(2) + if match_2 == "+": + match_2 = "多" + match_2: str = match_2 if match_2 else "" + quantifiers: str = match.group(3) + number: str = num2str(number) + result = f"{number}{match_2}{quantifiers}" + return result + + +def replace_number(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + sign = match.group(1) + number = match.group(2) + pure_decimal = match.group(5) + if pure_decimal: + result = num2str(pure_decimal) + else: + sign: str = "负" if sign else "" + number: str = num2str(number) + result = f"{sign}{number}" + return result + + +# 范围表达式 +# match.group(1) and match.group(8) are copy from RE_NUMBER + +RE_RANGE = re.compile( + r""" + (? str: + """ + Args: + match (re.Match) + Returns: + str + """ + first, second = match.group(1), match.group(6) + first = RE_NUMBER.sub(replace_number, first) + second = RE_NUMBER.sub(replace_number, second) + result = f"{first}到{second}" + return result + + +# ~至表达式 +RE_TO_RANGE = re.compile( + r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)') + +def replace_to_range(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + result = match.group(0).replace('~', '至') + return result + + +def _get_value(value_string: str, use_zero: bool=True) -> List[str]: + stripped = value_string.lstrip('0') + if len(stripped) == 0: + return [] + elif len(stripped) == 1: + if use_zero and len(stripped) < len(value_string): + return [DIGITS['0'], DIGITS[stripped]] + else: + return [DIGITS[stripped]] + else: + largest_unit = next( + power for power in reversed(UNITS.keys()) if power < len(stripped)) + first_part = 
value_string[:-largest_unit] + second_part = value_string[-largest_unit:] + return _get_value(first_part) + [UNITS[largest_unit]] + _get_value( + second_part) + + +def verbalize_cardinal(value_string: str) -> str: + if not value_string: + return '' + + # 000 -> '零' , 0 -> '零' + value_string = value_string.lstrip('0') + if len(value_string) == 0: + return DIGITS['0'] + + result_symbols = _get_value(value_string) + # verbalized number starting with '一十*' is abbreviated as `十*` + if len(result_symbols) >= 2 and result_symbols[0] == DIGITS[ + '1'] and result_symbols[1] == UNITS[1]: + result_symbols = result_symbols[1:] + return ''.join(result_symbols) + + +def verbalize_digit(value_string: str, alt_one=False) -> str: + result_symbols = [DIGITS[digit] for digit in value_string] + result = ''.join(result_symbols) + if alt_one: + result = result.replace("一", "幺") + return result + + +def num2str(value_string: str) -> str: + integer_decimal = value_string.split('.') + if len(integer_decimal) == 1: + integer = integer_decimal[0] + decimal = '' + elif len(integer_decimal) == 2: + integer, decimal = integer_decimal + else: + raise ValueError( + f"The value string: '${value_string}' has more than one point in it." 
+ ) + + result = verbalize_cardinal(integer) + + decimal = decimal.rstrip('0') + if decimal: + # '.22' is verbalized as '零点二二' + # '3.20' is verbalized as '三点二 + result = result if result else "零" + result += '点' + verbalize_digit(decimal) + return result + + +if __name__ == "__main__": + + text = "" + text = num2str(text) + print(text) + pass \ No newline at end of file diff --git a/lyric_processor_v2.py b/lyric_processor_v2.py new file mode 100644 index 0000000..fa07333 --- /dev/null +++ b/lyric_processor_v2.py @@ -0,0 +1,412 @@ +import torch.nn as nn +import torch +import random +from loguru import logger + +from transformers import UMT5EncoderModel, AutoTokenizer, AutoModel +import re +from typing import List, Tuple, Dict, Set +import sys +import os + + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + + +class TrieNode: + def __init__(self): + self.children: Dict[str, 'TrieNode'] = {} + self.is_end_of_word: bool = False + + +class Trie: + def __init__(self): + self.root = TrieNode() + + def insert(self, word: str): + node = self.root + for char in word: + if char not in node.children: + node.children[char] = TrieNode() + node = node.children[char] + node.is_end_of_word = True + + def search_from(self, s: str, start: int) -> List[str]: + """ + 从字符串 s 的位置 start 开始,使用 Trie 树查找所有可能的匹配 phoneme。 + 返回所有匹配的 phoneme。 + """ + node = self.root + matches = [] + current_phoneme = [] + for i in range(start, len(s)): + char = s[i] + if char in node.children: + node = node.children[char] + current_phoneme.append(char) + if node.is_end_of_word: + matches.append(''.join(current_phoneme)) + else: + break + return matches + + +class PhonemeMatcher: + def __init__(self, word_dict: Set[str]): + """ + 初始化 PhonemeMatcher,构建 Trie 树。 + + :param word_dict: Set[str] - 包含所有 phoneme 的集合 + """ + self.trie = Trie() + for word in word_dict: + self.trie.insert(word) + + def tokenize(self, s: str) -> List[str]: + """ + 将输入的 xsampa 字符串拆分成 phoneme 序列,尽可能使用词表中的 phoneme, + 并在无法完全匹配时,选择编辑距离最小且 
phoneme 数量最少的序列。 + + :param s: str - 输入的 xsampa 字符串 + :return: List[str] - 输出的 phoneme 序列 + """ + n = len(s) + # 初始化 DP 数组,dp[i] = (cost, phoneme_count, phone_list) + dp: List[Tuple[int, int, List[str]]] = [(sys.maxsize, sys.maxsize, []) for _ in range(n + 1)] + dp[0] = (0, 0, []) + + for i in range(n): + current_cost, current_count, current_list = dp[i] + if current_cost == sys.maxsize: + continue # 无法到达当前位置 + + # 查找所有从位置 i 开始的匹配 phoneme + matches = self.trie.search_from(s, i) + + if matches: + for phoneme in matches: + end = i + len(phoneme) + new_cost = current_cost # 匹配成功,无需增加编辑距离 + new_count = current_count + 1 + new_list = current_list + [phoneme] + + if new_cost < dp[end][0]: + dp[end] = (new_cost, new_count, new_list) + elif new_cost == dp[end][0]: + if new_count < dp[end][1]: + dp[end] = (new_cost, new_count, new_list) + else: + # 没有匹配的 phoneme,考虑跳过当前字符,增加编辑距离 + new_cost = current_cost + 1 + end = i + 1 + new_count = current_count + 1 # 跳过一个字符也算作一个 phoneme + new_list = current_list + [s[i]] + + if new_cost < dp[end][0]: + dp[end] = (new_cost, new_count, new_list) + elif new_cost == dp[end][0]: + if new_count < dp[end][1]: + dp[end] = (new_cost, new_count, new_list) + + # 如果无法完全匹配,选择最优的近似匹配 + if dp[n][0] == sys.maxsize: + # 找到所有可能的最小编辑距离 + min_cost = min(dp[i][0] for i in range(n + 1)) + # 选择最小编辑距离且 phoneme 数量最少的序列 + candidates = [dp[i] for i in range(n + 1) if dp[i][0] == min_cost] + if candidates: + # 选择 phoneme 数量最少的 + best = min(candidates, key=lambda x: x[1]) + return best[2] + else: + return [] + + return dp[n][2] + + +HARMONIX_LABELS = [ + 'start', + 'end', + 'intro', + 'outro', + 'break', + 'bridge', + 'inst', + 'solo', + 'verse', + 'chorus', +] + + +def timestamp2second(timestamps): + res = [] + for item in timestamps: + start, end = item["start"], item["end"] + # convert 8kHz to latents level + start = round(start / 8000, 2) + end = round(end / 8000, 2) + res.append({"start": start, "end": end}) + return res + + +def 
sample_lyric_mask(voiced_timestamp, max_length): + voiced_timestamps = timestamp2second(voiced_timestamp) + + min_gaps = [1,2,3,4,5] + while len(min_gaps) > 0: + min_gap = min_gaps.pop() + can_split_breaks = [] + last_end = 0.00 + for item in voiced_timestamps: + if item["start"] - last_end >= min_gap: + if last_end == 0.00: + can_split_breaks.append((last_end, item["start"] - 0.5)) + else: + can_split_breaks.append((last_end + 0.5, item["start"] - 0.5)) + last_end = item["end"] + if len(can_split_breaks) > 1: + if can_split_breaks[1][0] <= 360: + break + else: + if min_gap == 1: + return 0.0, 360.0, 36 + + if len(can_split_breaks) == 0: + mask_start, mask_end = 0.0, max_length + min_cut_level = int(mask_end//10 - mask_start//10 + 1) + return 0.0, mask_end, min_cut_level + + if len(can_split_breaks) == 1: + # 前后随机选一个 + mask_start = random.choice(["start", "middle"]) + if mask_start == "start": + mask_start = 0.0 + mask_end = random.uniform(can_split_breaks[0][0], can_split_breaks[0][1]) + else: + mask_start = random.uniform(can_split_breaks[0][0], can_split_breaks[0][1]) + mask_end = max_length + min_cut_level = int(mask_end//10 - mask_start//10 + 1) + return mask_start, mask_end, min_cut_level + + mask_start, mask_end = 0.0, 370 + min_cut_level = 37 + breaths_gap = [end-start for start, end in can_split_breaks] + max_tried = 5 + while mask_end - mask_start > 370 and min_cut_level > 0 and min_cut_level > 36: + total_breaths = len(can_split_breaks) + start = random.choices(range(total_breaths-1), weights=breaths_gap[:-1])[0] + end = random.choices(range(start + 1, total_breaths), weights=breaths_gap[start+1:], k=1)[0] + start_break, end_break = can_split_breaks[start], can_split_breaks[end] + mask_start, mask_end = random.uniform(start_break[0], start_break[1]), random.uniform(end_break[0], end_break[1]) + min_cut_level = int(mask_end//10 - mask_start//10 + 1) + if min_cut_level < 36: + min_cut_level = random.randint(min_cut_level, 36) + if max_tried == 0: + 
print("max tried", mask_start, mask_end, min_cut_level, "breaths_gap", breaths_gap, "can_split_breaks", can_split_breaks) + break + max_tried -= 1 + mask_start, mask_end = round(mask_start, 2), min(round(mask_end, 2), max_length) + return mask_start, mask_end, min_cut_level + + +def check_valid_lyric_lines(lyric_lines): + # must has lyric lines + if len(lyric_lines) == 0: + return False + for valid_lyric_line in lyric_lines: + if len(valid_lyric_line[1]) > 0: + return True + return False + + +def select_valid_lyric_lines(lyric_lines, mask_start, mask_end): + # 选歌词原则 + # 宁可多,不可少 + # 选取mask_start和mask_end之间的歌词行,如果mask_end在一个歌词行中间,那么这个歌词行也要被选取,但最后的structure不要 + valid_lyric_lines = [] + add_tail_structure = True + for lyric_line in lyric_lines: + if lyric_line["start"] > lyric_line["end"]: + continue + if lyric_line["start"]+1.0 >= mask_start and lyric_line["end"]-1.0 <= mask_end: + if len(valid_lyric_lines) > 0: + if valid_lyric_lines[-1][0] is not None and valid_lyric_lines[-1][0] != lyric_line["structure"] and lyric_line["structure"] != "": + valid_lyric_lines.append((lyric_line["structure"], [], [], (lyric_line["start"], lyric_line["end"]))) + elif lyric_line["structure"] != "": + valid_lyric_lines.append((lyric_line["structure"], [], [], (lyric_line["start"], lyric_line["end"]))) + lyric_line["lyric_line"] = lyric_line["lyric_line"].strip() + if lyric_line["lyric_line"] and "phoneme_line_ipa" in lyric_line and len(lyric_line["phoneme_line_ipa"]) > 0: + valid_lyric_lines.append((None, lyric_line["lyric_line"], lyric_line["phoneme_line_ipa"], (lyric_line["start"], lyric_line["end"]))) + elif mask_start < lyric_line["start"] and lyric_line["start"] < mask_end and lyric_line["end"] > mask_end: + lyric_line["lyric_line"] = lyric_line["lyric_line"].strip() + if lyric_line["lyric_line"] and "phoneme_line_ipa" in lyric_line and len(lyric_line["phoneme_line_ipa"]) > 0: + valid_lyric_lines.append((None, lyric_line["lyric_line"], lyric_line["phoneme_line_ipa"], 
def sample_lyric_mask_with_cut_levels(voiced_timestamp, cut_level, n_chunks, lyric_lines):
    """List candidate windows of ``cut_level`` ten-second chunks that contain lyrics.

    For every chunk index in ``range(n_chunks)`` a window
    ``[idx * 10, (idx + cut_level) * 10]`` seconds is considered. A window is a
    candidate only if ``select_valid_lyric_lines`` returns a selection that
    passes ``check_valid_lyric_lines``.

    Windows whose start or end second falls strictly inside a voiced segment
    are rejected first. If that strict filter leaves nothing, every window that
    has lyrics is returned instead.

    Args:
        voiced_timestamp: raw voiced-segment data, converted with
            ``timestamp2second`` into items exposing ``"start"`` / ``"end"``.
        cut_level: window length, in ten-second chunks.
        n_chunks: number of window start positions to try.
        lyric_lines: lyric line records forwarded to ``select_valid_lyric_lines``.

    Returns:
        A list of ``(start_second, end_second, selected_lyric_lines)`` tuples;
        empty when no window contains lyrics.
    """
    chunk_seconds = 10
    voiced_timestamps = timestamp2second(voiced_timestamp)

    def _cuts_voiced_segment(second):
        # True when ``second`` lies strictly inside some voiced segment.
        return any(item["start"] < second < item["end"] for item in voiced_timestamps)

    strict_spans = []
    relaxed_spans = []
    # Single pass: the relaxed fallback is collected alongside the strict list
    # so the lyric selection is not recomputed for every chunk when the strict
    # list turns out to be empty.
    for start_idx in range(n_chunks):
        start_second = start_idx * chunk_seconds
        end_second = (start_idx + cut_level) * chunk_seconds
        selected = select_valid_lyric_lines(lyric_lines, start_second, end_second)
        if not check_valid_lyric_lines(selected):
            continue
        span = (start_second, end_second, selected)
        relaxed_spans.append(span)
        if not (_cuts_voiced_segment(start_second) or _cuts_voiced_segment(end_second)):
            strict_spans.append(span)

    return strict_spans if strict_spans else relaxed_spans
non_structure_lyric_lines[end_idx]) + + if start_idx == end_idx and valid_lyric_lines[start_idx]["end"] > end: + res = [(None, valid_lyric_lines[start_idx]["lyric_line"], valid_lyric_lines[start_idx]["phoneme_line_ipa"], (valid_lyric_lines[start_idx]["start"], valid_lyric_lines[start_idx]["end"])) for line_idx in range(start_idx, end_idx+1)] + if len(res) > 0: + partial_spans.append((start, end, res)) + break + + if end_idx > 0 and end < valid_lyric_lines[end_idx]["start"] and valid_lyric_lines[end_idx-1]["end"] + start_pad_offset < end: + res = [(None, valid_lyric_lines[line_idx]["lyric_line"], valid_lyric_lines[line_idx]["phoneme_line_ipa"], (valid_lyric_lines[line_idx]["start"], valid_lyric_lines[line_idx]["end"])) for line_idx in range(start_idx, end_idx)] + if len(res) > 0: + full_spans.append((start, end, res)) + break + + if end < valid_lyric_lines[end_idx]["end"] + start_pad_offset and end > valid_lyric_lines[end_idx]["start"]: + res = [(None, valid_lyric_lines[line_idx]["lyric_line"], valid_lyric_lines[line_idx]["phoneme_line_ipa"], (valid_lyric_lines[line_idx]["start"], valid_lyric_lines[line_idx]["end"])) for line_idx in range(start_idx, end_idx)] + if len(res) > 0: + partial_spans.append((start, end, res)) + break + + if valid_lyric_lines[end_idx]["start"] > end: + break + + if start_idx == 0 and end_idx == len(valid_lyric_lines) - 1 and len(full_spans) == 0 and len(partial_spans) == 0: + res = [(None, valid_lyric_lines[line_idx]["lyric_line"], valid_lyric_lines[line_idx]["phoneme_line_ipa"], (valid_lyric_lines[line_idx]["start"], valid_lyric_lines[line_idx]["end"])) for line_idx in range(start_idx, end_idx+1)] + if len(res) > 0: + full_spans.append((start, end, res)) + if expected_num_example is not None: + if len(full_spans) >= expected_num_example or len(partial_spans) == 0: + return full_spans + if len(full_spans) + len(partial_spans) >= expected_num_example: + left = expected_num_example - len(full_spans) + return full_spans + 
class LyricProcessor(nn.Module):
    """Frozen UMT5 text encoder plus helpers that flatten lyric records into text."""

    def __init__(self, infer=False):
        super().__init__()
        encoder = UMT5EncoderModel.from_pretrained("./checkpoints/umt5-base", local_files_only=True)
        # Inference-only encoder: eval mode, half precision, no gradients.
        self.lyric_text_model = encoder.eval().half()
        self.lyric_text_model.requires_grad_(False)
        self.lyric_text_tokenizer = AutoTokenizer.from_pretrained("./checkpoints/umt5-base", local_files_only=True)

    def get_text_embeddings(self, texts, device, text_max_length=256):
        """Tokenize ``texts`` and return ``(last_hidden_state, attention_mask)``.

        Tokenizer outputs are moved to ``device``; the encoder is moved there
        too when its current device compares unequal to ``device``.
        """
        encoded = self.lyric_text_tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=text_max_length,
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        if self.lyric_text_model.device != device:
            self.lyric_text_model.to(device)
        with torch.no_grad():
            hidden = self.lyric_text_model(**encoded).last_hidden_state
        return hidden, encoded["attention_mask"]

    def preprocess(self, valid_lyric_lines):
        """Flatten lyric records into one lyric string and one IPA string.

        Every record is a mapping with ``"structure"``, ``"lyric"`` and
        ``"ipa"`` entries. Non-empty structure and lyric texts are emitted in
        that order (an empty string when both are empty) and joined with
        newlines. IPA lines are joined with ``" _ "``; a line equal to ``"_"``
        is passed through unchanged, any other line is split on spaces, run
        through ``self.split_unk`` and cleaned up.
        """
        lyric_texts, ipa_texts = [], []
        for record in valid_lyric_lines:
            structure = record["structure"]
            lyric_line = record["lyric"]
            ipa_line = record["ipa"]

            pieces = [text for text in (structure, lyric_line) if len(text) > 0]
            lyric_texts.extend(pieces if pieces else [""])

            if ipa_line == "_":
                ipa_texts.append(ipa_line)
                continue

            tokens = self.split_unk(ipa_line.split(" "))
            cleaned = " ".join(tokens)
            # Drop runs of "z ə z ..." (the original comment attributes these to a G2P bug).
            cleaned = re.sub(r'\bz(?:\s+ə\s+z)+\b', "", cleaned)
            # Collapse whitespace left behind by the removal.
            cleaned = re.sub(r'\s+', ' ', cleaned).strip()
            ipa_texts.append(cleaned)

        return "\n".join(lyric_texts), " _ ".join(ipa_texts)
class Pipeline(LightningModule):
    """Lightning module that trains and samples the ACE flow-matching text-to-music model.

    Training (``run_step``) noises VAE latents with flow-matching sigmas, asks
    the transformer for a prediction conditioned on text, speaker and lyric
    inputs, and combines a masked MSE loss with the projection losses returned
    by the transformer. ``diffusion_process`` runs the sampler with APG
    guidance, and ``plot_step`` periodically writes reference and generated
    audio to the logger directory.
    """

    def __init__(
        self,
        learning_rate: float = 1e-4,
        num_workers: int = 4,
        infer: bool = False,
        train: bool = True,
        T: int = 1000,
        minibatch_size: int = 32,
        batch_size: int = 1,
        snr_gamma: float = 0.5,
        prediction_type: str = "v_prediction",  # epsilon, sample, v_prediction
        beta_start: float = 0.0015,
        beta_end: float = 0.0195,
        noise_offset: float = 0.1,
        input_perturbation: float = 0.1,
        use_ema: bool = False,
        enable_xformers_memory_efficient_attention: bool = False,
        weight_decay: float = 1e-2,
        num_chunk: int = 2,
        beta_schedule: str = "scaled_linear",
        scheduler_type: str = "ddpm",
        every_plot_step: int = 2000,
        vocal_noise: float = 0,
        max_length: int = 6400,
        sample_size: int = None,
        target_orig: bool = True,
        csv_path: str = None,
        config_path: str = "./models/config_sana_text2music_dcae_0225_3.5B_simple.json",
        shift: float = 3.0,
        logit_mean: float = 0.0,
        logit_std: float = 1.0,
        timestep_densities_type: str = "logit_normal",
        ssl_coeff: float = 1.0,
        wav_max_seconds: float = 30.0,
        max_steps: int = -1,
        fix_cut_level: int = 3,
        ipa_max_length: int = 8192,
        text_max_length: int = 1024,
    ):
        """Build the scheduler, transformer, lyric encoder, VAE and (for training) the SSL teachers.

        All arguments are stored through ``save_hyperparameters`` and read back
        via ``self.hparams``. The MERT and mHuBERT models are only loaded when
        ``infer`` is false and ``train`` is true.
        """
        super().__init__()

        self.save_hyperparameters()
        self.is_train = train
        self.T = T
        self.beta_start = beta_start
        self.beta_end = beta_end

        self.scheduler = self.get_scheduler()
        with open(config_path, "r") as f:
            self.config = json.load(f)
        self.transformers = ACEFlowBaseModel(**self.config)

        self.lyric_processor = LyricProcessor()
        self.lyric_processor.requires_grad_(False)

        if not infer and self.is_train:
            # Frozen self-supervised teachers used for the projection losses.
            self.mert_model = AutoModel.from_pretrained("./checkpoints/MERT-v1-330M", trust_remote_code=True).eval()
            self.mert_model.requires_grad_(False)
            self.resampler_mert = torchaudio.transforms.Resample(orig_freq=48000, new_freq=24000)
            self.processor_mert = Wav2Vec2FeatureExtractor.from_pretrained("./checkpoints/MERT-v1-330M", trust_remote_code=True)

            self.hubert_model = AutoModel.from_pretrained("checkpoints/mHuBERT-147", local_files_only=True).eval()
            self.hubert_model.requires_grad_(False)
            self.resampler_mhubert = torchaudio.transforms.Resample(orig_freq=48000, new_freq=16000)
            self.processor_mhubert = Wav2Vec2FeatureExtractor.from_pretrained("checkpoints/mHuBERT-147", local_files_only=True)

            self.ssl_coeff = ssl_coeff

        # The VAE is needed in both modes. It is created after the optional
        # teachers so submodule registration order matches the previous layout.
        self.vae = MusicDCAE(encoder_only=False).eval()
        self.vae.requires_grad_(False)

    def _infer_chunked_ssl(self, target_wavs, wav_lengths, resampler, ssl_model, rate_divisor, chunk_size, stride=320):
        """Run a frozen SSL model over fixed-size chunks and regroup features per audio.

        Args:
            target_wavs: waveform batch; the channel axis (dim 1) is averaged
                to mono before resampling.
            wav_lengths: per-item valid lengths at the input sample rate.
            resampler: transform that brings the mono signal to the model rate.
            ssl_model: model whose output exposes ``last_hidden_state``.
            rate_divisor: integer ratio between input rate and model rate.
            chunk_size: number of samples fed to the model per chunk.
            stride: samples per output frame assumed when trimming padding.

        Returns:
            A list with one tensor per batch item: the hidden states of that
            item's chunks concatenated along the frame axis.
        """
        mono = resampler(target_wavs.mean(dim=1))
        bsz = target_wavs.shape[0]
        actual_lengths = wav_lengths // rate_divisor

        # Zero-mean / unit-variance normalisation using only the valid samples.
        means = torch.stack([mono[i, :actual_lengths[i]].mean() for i in range(bsz)])
        variances = torch.stack([mono[i, :actual_lengths[i]].var() for i in range(bsz)])
        mono = (mono - means.view(-1, 1)) / torch.sqrt(variances.view(-1, 1) + 1e-7)

        # Ceiling division: number of chunks each audio contributes.
        num_chunks_per_audio = (actual_lengths + chunk_size - 1) // chunk_size

        all_chunks = []
        chunk_actual_lengths = []
        for i in range(bsz):
            audio = mono[i]
            actual_length = actual_lengths[i]
            for start in range(0, actual_length, chunk_size):
                end = min(start + chunk_size, actual_length)
                chunk = audio[start:end]
                if len(chunk) < chunk_size:
                    # Zero-pad the last, shorter chunk so all chunks can be stacked.
                    chunk = F.pad(chunk, (0, chunk_size - len(chunk)))
                all_chunks.append(chunk)
                chunk_actual_lengths.append(end - start)

        # (total_chunks, chunk_size)
        all_chunks = torch.stack(all_chunks, dim=0)

        with torch.no_grad():
            # (total_chunks, seq_len, hidden_size)
            hidden_states = ssl_model(all_chunks).last_hidden_state

        # Keep only the frames that correspond to real (unpadded) samples.
        chunk_num_features = [(length + stride - 1) // stride for length in chunk_actual_lengths]
        chunk_hidden_states = [hidden_states[i, :chunk_num_features[i], :] for i in range(len(all_chunks))]

        # Regroup the flat chunk list by originating audio.
        per_audio_hidden_states = []
        chunk_idx = 0
        for i in range(bsz):
            audio_chunks = chunk_hidden_states[chunk_idx:chunk_idx + num_chunks_per_audio[i]]
            per_audio_hidden_states.append(torch.cat(audio_chunks, dim=0))
            chunk_idx += num_chunks_per_audio[i]

        return per_audio_hidden_states

    def infer_mert_ssl(self, target_wavs, wav_lengths):
        """Return per-audio MERT hidden states (48 kHz input -> 24 kHz mono, 5-second chunks)."""
        return self._infer_chunked_ssl(
            target_wavs,
            wav_lengths,
            resampler=self.resampler_mert,
            ssl_model=self.mert_model,
            rate_divisor=2,  # 48 kHz -> 24 kHz
            chunk_size=24000 * 5,
        )

    def infer_mhubert_ssl(self, target_wavs, wav_lengths):
        """Return per-audio mHuBERT hidden states (48 kHz input -> 16 kHz mono, 30-second chunks)."""
        return self._infer_chunked_ssl(
            target_wavs,
            wav_lengths,
            resampler=self.resampler_mhubert,
            ssl_model=self.hubert_model,
            rate_divisor=3,  # 48 kHz -> 16 kHz
            chunk_size=16000 * 30,
        )

    def preprocess(self, batch, train=True):
        """Turn a dataloader batch into the tensors consumed by the transformer.

        When ``train`` is true the SSL teacher features are computed, and the
        text embedding and the lyric tokens are each independently zeroed for
        a random subset of items (probability 0.15) so the model also sees
        unconditional inputs.

        Returns:
            ``(keys, target_latents, attention_mask, encoder_text_hidden_states,
            text_attention_mask, speaker_embds, lyric_token_ids, lyric_mask,
            mert_ssl_hidden_states, mhubert_ssl_hidden_states)``; the last two
            are ``None`` when ``train`` is false.
        """
        target_wavs = batch["target_wavs"]
        wav_lengths = batch["wav_lengths"]

        dtype = target_wavs.dtype
        bs = target_wavs.shape[0]
        device = target_wavs.device

        # SSL constraints (training only).
        mert_ssl_hidden_states = None
        mhubert_ssl_hidden_states = None
        if train:
            with torch.amp.autocast(device_type="cuda", dtype=dtype):
                mert_ssl_hidden_states = self.infer_mert_ssl(target_wavs, wav_lengths)
                mhubert_ssl_hidden_states = self.infer_mhubert_ssl(target_wavs, wav_lengths)

        # Text embedding.
        texts = batch["prompts"]
        encoder_text_hidden_states, text_attention_mask = self.lyric_processor.get_text_embeddings(texts, device)
        encoder_text_hidden_states = encoder_text_hidden_states.to(dtype)

        target_latents, _ = self.vae.encode(target_wavs, wav_lengths)
        attention_mask = torch.ones(bs, target_latents.shape[-1], device=device, dtype=dtype)

        speaker_embds = batch["speaker_embs"].to(dtype)
        keys = batch["keys"]
        lyric_token_ids = batch["lyric_token_ids"]
        lyric_mask = batch["lyric_masks"]

        # Condition dropout for classifier-free guidance.
        if train:
            # Text prompt: 1 keeps the condition, 0 drops it.
            full_cfg_condition_mask = torch.where(
                (torch.rand(size=(bs,), device=device) < 0.15),
                torch.zeros(size=(bs,), device=device),
                torch.ones(size=(bs,), device=device)
            ).long()
            encoder_text_hidden_states = torch.where(
                full_cfg_condition_mask.unsqueeze(1).unsqueeze(1).bool(),
                encoder_text_hidden_states,
                torch.zeros_like(encoder_text_hidden_states),
            )

            # Lyrics: an independent draw with the same drop probability.
            full_cfg_condition_mask = torch.where(
                (torch.rand(size=(bs,), device=device) < 0.15),
                torch.zeros(size=(bs,), device=device),
                torch.ones(size=(bs,), device=device)
            ).long()
            lyric_token_ids = torch.where(full_cfg_condition_mask.unsqueeze(1).bool(), lyric_token_ids, torch.zeros_like(lyric_token_ids))
            lyric_mask = torch.where(full_cfg_condition_mask.unsqueeze(1).bool(), lyric_mask, torch.zeros_like(lyric_mask))

        return (
            keys,
            target_latents,
            attention_mask,
            encoder_text_hidden_states,
            text_attention_mask,
            speaker_embds,
            lyric_token_ids,
            lyric_mask,
            mert_ssl_hidden_states,
            mhubert_ssl_hidden_states,
        )

    def get_scheduler(self):
        """Create the flow-matching Euler scheduler used for training noise levels."""
        return FlowMatchEulerDiscreteScheduler(
            num_train_timesteps=self.T,
            shift=self.hparams.shift,
        )

    def configure_optimizers(self):
        """Return AdamW over the transformer parameters and its LR scheduler."""
        optimizer = torch.optim.AdamW(
            params=[
                {'params': self.transformers.parameters()},
            ],
            lr=self.hparams.learning_rate,
            weight_decay=self.hparams.weight_decay,
            betas=(0.8, 0.9),
        )
        max_steps = self.hparams.max_steps
        # NOTE(review): with the default max_steps=-1 this interval is not
        # positive; presumably max_steps is always set for training. Confirm.
        decay_interval = int(max_steps * (1 - 0.9) * 0.2)
        lr_scheduler = configure_lr_scheduler(optimizer, total_steps_per_epoch=max_steps, epochs=1, decay_ratio=0.9, decay_interval=decay_interval, warmup_iters=4000)
        return [optimizer], lr_scheduler

    def get_sd3_sigmas(self, timesteps, device, n_dim=4, dtype=torch.float32):
        """Look up the scheduler sigma for each timestep, with trailing axes added up to ``n_dim``."""
        sigmas = self.scheduler.sigmas.to(device=device, dtype=dtype)
        schedule_timesteps = self.scheduler.timesteps.to(device)
        timesteps = timesteps.to(device)
        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
        sigma = sigmas[step_indices].flatten()
        while len(sigma.shape) < n_dim:
            sigma = sigma.unsqueeze(-1)
        return sigma

    def get_timestep(self, bsz, device):
        """Sample ``bsz`` training timesteps according to ``hparams.timestep_densities_type``.

        Raises:
            ValueError: if the configured density type is neither
                ``"logit_normal"`` nor ``"u_shape"``.
        """
        density = self.hparams.timestep_densities_type
        if density == "logit_normal":
            # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$): draw
            # u ~ N(m, s) and map it through the standard logistic function.
            u = torch.normal(mean=self.hparams.logit_mean, std=self.hparams.logit_std, size=(bsz, ), device="cpu")
            u = torch.nn.functional.sigmoid(u)
        elif density == "u_shape":
            # ``a`` controls how pronounced the U shape is.
            a = 4.0
            v = torch.rand(bsz)
            # u = 0.5 + (1/a) * asinh(sinh(a/2) * (2*v - 1))
            s = torch.sinh(torch.tensor(a/2))
            argument = s * (2 * v - 1)
            u = 0.5 + (1.0 / a) * torch.asinh(argument)
            # Guard against tiny numerical excursions outside [0, 1].
            u = torch.clamp(u, 0.0, 1.0)
        else:
            raise ValueError(
                f"Unsupported timestep_densities_type: {density!r} "
                "(expected 'logit_normal' or 'u_shape')"
            )

        # Map the continuous u in [0, 1] onto the discrete scheduler timesteps.
        num_train_timesteps = self.scheduler.config.num_train_timesteps
        indices = (u * num_train_timesteps).long()
        indices = torch.clamp(indices, 0, num_train_timesteps - 1)
        return self.scheduler.timesteps[indices].to(device)

    def run_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch.

        The loss is the masked MSE between the preconditioned model output and
        the clean latents, plus ``ssl_coeff`` times the mean projection loss
        reported by the transformer.
        """
        self.plot_step(batch, batch_idx)
        (
            keys,
            target_latents,
            attention_mask,
            encoder_text_hidden_states,
            text_attention_mask,
            speaker_embds,
            lyric_token_ids,
            lyric_mask,
            mert_ssl_hidden_states,
            mhubert_ssl_hidden_states,
        ) = self.preprocess(batch)

        target_image = target_latents
        device = target_image.device
        dtype = target_image.dtype

        # Draw noise and timesteps.
        noise = torch.randn_like(target_image, device=device)
        bsz = target_image.shape[0]
        timesteps = self.get_timestep(bsz, device)

        # Add noise according to flow matching.
        sigmas = self.get_sd3_sigmas(timesteps=timesteps, device=device, n_dim=target_image.ndim, dtype=dtype)
        noisy_image = sigmas * noise + (1.0 - sigmas) * target_image

        # The regression target is the clean latent.
        target = target_image

        # SSL teacher features passed to the transformer for the projection losses.
        all_ssl_hiden_states = []
        if mert_ssl_hidden_states is not None:
            all_ssl_hiden_states.append(mert_ssl_hidden_states)
        if mhubert_ssl_hidden_states is not None:
            all_ssl_hiden_states.append(mhubert_ssl_hidden_states)

        x = noisy_image
        transformer_output = self.transformers(
            hidden_states=x,
            attention_mask=attention_mask,
            encoder_text_hidden_states=encoder_text_hidden_states,
            text_attention_mask=text_attention_mask,
            speaker_embeds=speaker_embds,
            lyric_token_idx=lyric_token_ids,
            lyric_mask=lyric_mask,
            timestep=timesteps.to(device).to(dtype),
            ssl_hidden_states=all_ssl_hiden_states,
        )
        model_pred = transformer_output.sample
        proj_losses = transformer_output.proj_losses

        # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
        # Preconditioning of the model outputs.
        model_pred = model_pred * (-sigmas) + noisy_image

        # Only unmasked (non-padded) positions contribute to the loss.
        # N x T -> N x c x W x T
        mask = attention_mask.unsqueeze(1).unsqueeze(1).expand(-1, target_image.shape[1], target_image.shape[2], -1)

        selected_model_pred = (model_pred * mask).reshape(bsz, -1).contiguous()
        selected_target = (target * mask).reshape(bsz, -1).contiguous()

        loss = F.mse_loss(selected_model_pred, selected_target, reduction="none")
        loss = loss.mean(1)
        # NOTE(review): the per-item mean is multiplied by the mask coverage;
        # preprocess builds an all-ones mask, so this is currently a no-op.
        loss = loss * mask.reshape(bsz, -1).mean(1)
        loss = loss.mean()

        prefix = "train"

        self.log(f"{prefix}/denoising_loss", loss, on_step=True, on_epoch=False, prog_bar=True)

        total_proj_loss = 0.0
        for k, v in proj_losses:
            self.log(f"{prefix}/{k}_loss", v, on_step=True, on_epoch=False, prog_bar=True)
            total_proj_loss += v

        if len(proj_losses) > 0:
            total_proj_loss = total_proj_loss / len(proj_losses)

        loss = loss + total_proj_loss * self.ssl_coeff
        self.log(f"{prefix}/loss", loss, on_step=True, on_epoch=False, prog_bar=True)

        learning_rate = self.lr_schedulers().get_last_lr()[0]
        self.log(f"{prefix}/learning_rate", learning_rate, on_step=True, on_epoch=False, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Lightning hook: delegate to ``run_step``."""
        return self.run_step(batch, batch_idx)

    @torch.no_grad()
    def diffusion_process(
        self,
        duration,
        encoder_text_hidden_states,
        text_attention_mask,
        speaker_embds,
        lyric_token_ids,
        lyric_mask,
        random_generators=None,
        infer_steps=60,
        guidance_scale=15.0,
        omega_scale=10.0,
    ):
        """Sample latents for ``duration`` seconds of audio.

        Guidance is applied through ``apg_forward`` unless ``guidance_scale``
        is 0.0 or 1.0. With guidance, every conditioning tensor is doubled
        along the batch axis, the second half being zeros (unconditional).

        Returns:
            The latents after the last scheduler step.
        """
        do_classifier_free_guidance = True
        if guidance_scale == 0.0 or guidance_scale == 1.0:
            do_classifier_free_guidance = False

        device = encoder_text_hidden_states.device
        dtype = encoder_text_hidden_states.dtype
        bsz = encoder_text_hidden_states.shape[0]

        scheduler = FlowMatchEulerDiscreteScheduler(
            num_train_timesteps=1000,
            shift=3.0,
        )

        frame_length = int(duration * 44100 / 512 / 8)
        timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=infer_steps, device=device, timesteps=None)

        target_latents = randn_tensor(shape=(bsz, 8, 16, frame_length), generator=random_generators, device=device, dtype=dtype)
        attention_mask = torch.ones(bsz, frame_length, device=device, dtype=dtype)
        if do_classifier_free_guidance:
            attention_mask = torch.cat([attention_mask] * 2, dim=0)
            encoder_text_hidden_states = torch.cat([encoder_text_hidden_states, torch.zeros_like(encoder_text_hidden_states)], 0)
            text_attention_mask = torch.cat([text_attention_mask] * 2, dim=0)

            speaker_embds = torch.cat([speaker_embds, torch.zeros_like(speaker_embds)], 0)

            lyric_token_ids = torch.cat([lyric_token_ids, torch.zeros_like(lyric_token_ids)], 0)
            lyric_mask = torch.cat([lyric_mask, torch.zeros_like(lyric_mask)], 0)

        momentum_buffer = MomentumBuffer()

        for i, t in tqdm(enumerate(timesteps), total=num_inference_steps):
            # Expand the latents when doing classifier-free guidance.
            latents = target_latents
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
            timestep = t.expand(latent_model_input.shape[0])
            noise_pred = self.transformers(
                hidden_states=latent_model_input,
                attention_mask=attention_mask,
                encoder_text_hidden_states=encoder_text_hidden_states,
                text_attention_mask=text_attention_mask,
                speaker_embeds=speaker_embds,
                lyric_token_idx=lyric_token_ids,
                lyric_mask=lyric_mask,
                timestep=timestep,
            ).sample

            if do_classifier_free_guidance:
                noise_pred_with_cond, noise_pred_uncond = noise_pred.chunk(2)
                noise_pred = apg_forward(
                    pred_cond=noise_pred_with_cond,
                    pred_uncond=noise_pred_uncond,
                    guidance_scale=guidance_scale,
                    momentum_buffer=momentum_buffer,
                )

            target_latents = scheduler.step(model_output=noise_pred, timestep=t, sample=target_latents, return_dict=False, omega=omega_scale)[0]

        return target_latents

    def predict_step(self, batch, batch_idx):
        """Generate audio for a batch and return it next to the reference audio.

        Returns:
            A dict with ``target_wavs``, ``pred_wavs``, ``keys``, ``prompts``,
            ``candidate_lyric_chunks``, ``sr`` and the per-item ``seeds``.
        """
        (
            keys,
            target_latents,
            attention_mask,
            encoder_text_hidden_states,
            text_attention_mask,
            speaker_embds,
            lyric_token_ids,
            lyric_mask,
            mert_ssl_hidden_states,
            mhubert_ssl_hidden_states,
        ) = self.preprocess(batch, train=False)

        infer_steps = 60
        guidance_scale = 15.0
        omega_scale = 10.0
        seed_num = 1234
        # A private RNG yields the same seed sequence as reseeding the global
        # ``random`` module, without clobbering global random state. This
        # method is also called from plot_step in the middle of training.
        seed_rng = random.Random(seed_num)
        bsz = target_latents.shape[0]
        random_generators = [torch.Generator(device=self.device) for _ in range(bsz)]
        seeds = []
        for generator in random_generators:
            seed = seed_rng.randint(0, 2**32 - 1)
            generator.manual_seed(seed)
            seeds.append(seed)
        duration = self.hparams.fix_cut_level * 10
        pred_latents = self.diffusion_process(
            duration=duration,
            encoder_text_hidden_states=encoder_text_hidden_states,
            text_attention_mask=text_attention_mask,
            speaker_embds=speaker_embds,
            lyric_token_ids=lyric_token_ids,
            lyric_mask=lyric_mask,
            random_generators=random_generators,
            infer_steps=infer_steps,
            guidance_scale=guidance_scale,
            omega_scale=omega_scale,
        )

        audio_lengths = batch["wav_lengths"]
        sr, pred_wavs = self.vae.decode(pred_latents, audio_lengths=audio_lengths, sr=48000)
        return {
            "target_wavs": batch["target_wavs"],
            "pred_wavs": pred_wavs,
            "keys": keys,
            "prompts": batch["prompts"],
            "candidate_lyric_chunks": batch["candidate_lyric_chunks"],
            "sr": sr,
            "seeds": seeds,
        }

    def construct_lyrics(self, candidate_lyric_chunk):
        """Join the ``"lyric"`` field of every chunk with newlines."""
        return "\n".join(chunk["lyric"] for chunk in candidate_lyric_chunk)

    def plot_step(self, batch, batch_idx):
        """Every ``every_plot_step`` batches, write reference/generated audio and metadata to disk.

        Only rank 0 on CUDA device 0 writes. The distributed rank is queried
        only when a process group is initialised, so runs without one no
        longer fail on the ``get_rank`` call.
        """
        if batch_idx % self.hparams.every_plot_step != 0 or self.local_rank != 0:
            return
        distributed_ready = torch.distributed.is_available() and torch.distributed.is_initialized()
        if distributed_ready and torch.distributed.get_rank() != 0:
            return
        if torch.cuda.current_device() != 0:
            return

        results = self.predict_step(batch, batch_idx)

        sr = results["sr"]
        save_dir = f"{self.logger.log_dir}/eval_results/step_{self.global_step}"
        samples = zip(
            results["keys"],
            results["target_wavs"],
            results["pred_wavs"],
            results["prompts"],
            results["candidate_lyric_chunks"],
            results["seeds"],
        )
        for i, (key, target_wav, pred_wav, prompt, candidate_lyric_chunk, seed) in enumerate(samples):
            lyric = self.construct_lyrics(candidate_lyric_chunk)
            key_prompt_lyric = f"# KEY\n\n{key}\n\n\n# PROMPT\n\n{prompt}\n\n\n# LYRIC\n\n{lyric}\n\n# SEED\n\n{seed}\n\n"
            os.makedirs(save_dir, exist_ok=True)
            torchaudio.save(f"{save_dir}/target_wav_{key}_{i}.flac", target_wav.float().cpu(), sr)
            torchaudio.save(f"{save_dir}/pred_wav_{key}_{i}.flac", pred_wav.float().cpu(), sr)
            # Prompts and lyrics may contain non-ASCII text; pin the encoding.
            with open(f"{save_dir}/key_prompt_lyric_{key}_{i}.txt", "w", encoding="utf-8") as f:
                f.write(key_prompt_lyric)
+from typing import Any, Dict, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from diffusers.utils import deprecate, logging +from diffusers.utils.torch_utils import maybe_allow_in_graph +from diffusers.models.activations import GEGLU, GELU, ApproximateGELU +from diffusers.models.embeddings import SinusoidalPositionalEmbedding +from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm, AdaLayerNormZeroSingle + + +try: + # from .dcformer import DCMHAttention + from .customer_attention_processor import Attention, CustomJointAttnProcessor2_0, CustomLiteLAProcessor2_0, CustomerAttnProcessor2_0, CustomLiteLAMMDiTProcessor2_0 +except ImportError: + # from dcformer import DCMHAttention + from customer_attention_processor import Attention, CustomJointAttnProcessor2_0, CustomLiteLAProcessor2_0, CustomerAttnProcessor2_0, CustomLiteLAMMDiTProcessor2_0 + + +logger = logging.get_logger(__name__) + + +def _chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int): + # "feed_forward_chunk_size" can be used to save memory + if hidden_states.shape[chunk_dim] % chunk_size != 0: + raise ValueError( + f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`." + ) + + num_chunks = hidden_states.shape[chunk_dim] // chunk_size + ff_output = torch.cat( + [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)], + dim=chunk_dim, + ) + return ff_output + + +@maybe_allow_in_graph +class GatedSelfAttentionDense(nn.Module): + r""" + A gated self-attention dense layer that combines visual features and object features. + + Parameters: + query_dim (`int`): The number of channels in the query. + context_dim (`int`): The number of channels in the context. 
+ n_heads (`int`): The number of heads to use for attention. + d_head (`int`): The number of channels in each head. + """ + + def __init__(self, query_dim: int, context_dim: int, n_heads: int, d_head: int): + super().__init__() + + # we need a linear projection since we need cat visual feature and obj feature + self.linear = nn.Linear(context_dim, query_dim) + + self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head) + self.ff = FeedForward(query_dim, activation_fn="geglu") + + self.norm1 = nn.LayerNorm(query_dim) + self.norm2 = nn.LayerNorm(query_dim) + + self.register_parameter("alpha_attn", nn.Parameter(torch.tensor(0.0))) + self.register_parameter("alpha_dense", nn.Parameter(torch.tensor(0.0))) + + self.enabled = True + + def forward(self, x: torch.Tensor, objs: torch.Tensor) -> torch.Tensor: + if not self.enabled: + return x + + n_visual = x.shape[1] + objs = self.linear(objs) + + x = x + self.alpha_attn.tanh() * self.attn(self.norm1(torch.cat([x, objs], dim=1)))[:, :n_visual, :] + x = x + self.alpha_dense.tanh() * self.ff(self.norm2(x)) + + return x + + +@maybe_allow_in_graph +class BasicTransformerBlock(nn.Module): + r""" + A basic Transformer block. + + Parameters: + dim (`int`): The number of channels in the input and output. + num_attention_heads (`int`): The number of heads to use for multi-head attention. + attention_head_dim (`int`): The number of channels in each head. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + num_embeds_ada_norm (: + obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`. + attention_bias (: + obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter. 
+ only_cross_attention (`bool`, *optional*): + Whether to use only cross-attention layers. In this case two cross attention layers are used. + double_self_attention (`bool`, *optional*): + Whether to use two self-attention layers. In this case no cross attention layers are used. + upcast_attention (`bool`, *optional*): + Whether to upcast the attention computation to float32. This is useful for mixed precision training. + norm_elementwise_affine (`bool`, *optional*, defaults to `True`): + Whether to use learnable elementwise affine parameters for normalization. + norm_type (`str`, *optional*, defaults to `"layer_norm"`): + The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`. + final_dropout (`bool` *optional*, defaults to False): + Whether to apply a final dropout after the last feed-forward layer. + attention_type (`str`, *optional*, defaults to `"default"`): + The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`. + positional_embeddings (`str`, *optional*, defaults to `None`): + The type of positional embeddings to apply to. + num_positional_embeddings (`int`, *optional*, defaults to `None`): + The maximum number of positional embeddings to apply. 
+ """ + + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + dropout=0.0, + cross_attention_dim: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + attention_bias: bool = False, + only_cross_attention: bool = False, + double_self_attention: bool = False, + upcast_attention: bool = False, + norm_elementwise_affine: bool = True, + norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen' + norm_eps: float = 1e-5, + final_dropout: bool = False, + attention_type: str = "default", + positional_embeddings: Optional[str] = None, + num_positional_embeddings: Optional[int] = None, + ada_norm_continous_conditioning_embedding_dim: Optional[int] = None, + ada_norm_bias: Optional[int] = None, + ff_inner_dim: Optional[int] = None, + ff_bias: bool = True, + attention_out_bias: bool = True, + use_rms_norm: bool = False, + ): + super().__init__() + self.only_cross_attention = only_cross_attention + self.use_rms_norm = use_rms_norm + + # We keep these boolean flags for backward-compatibility. + self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero" + self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm" + self.use_ada_layer_norm_single = norm_type == "ada_norm_single" + self.use_layer_norm = norm_type == "layer_norm" + self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous" + + if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None: + raise ValueError( + f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to" + f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}." 
+ ) + + self.norm_type = norm_type + self.num_embeds_ada_norm = num_embeds_ada_norm + + if positional_embeddings and (num_positional_embeddings is None): + raise ValueError( + "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined." + ) + + if positional_embeddings == "sinusoidal": + self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings) + else: + self.pos_embed = None + + # Define 3 blocks. Each block has its own normalization layer. + # 1. Self-Attn + if norm_type == "ada_norm": + self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) + elif norm_type == "ada_norm_zero": + self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm) + elif norm_type == "ada_norm_continuous": + self.norm1 = AdaLayerNormContinuous( + dim, + ada_norm_continous_conditioning_embedding_dim, + norm_elementwise_affine, + norm_eps, + ada_norm_bias, + "rms_norm", + ) + else: + if use_rms_norm: + self.norm1 = RMSNorm(dim, norm_eps, norm_elementwise_affine) + else: + self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps) + + self.attn1 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + cross_attention_dim=cross_attention_dim if only_cross_attention else None, + upcast_attention=upcast_attention, + out_bias=attention_out_bias, + ) + + # 2. Cross-Attn + if cross_attention_dim is not None or double_self_attention: + # We currently only use AdaLayerNormZero for self attention where there will only be one attention block. + # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during + # the second cross attention block. 
+ if norm_type == "ada_norm": + self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm) + elif norm_type == "ada_norm_continuous": + self.norm2 = AdaLayerNormContinuous( + dim, + ada_norm_continous_conditioning_embedding_dim, + norm_elementwise_affine, + norm_eps, + ada_norm_bias, + "rms_norm", + ) + else: + if use_rms_norm: + self.norm2 = RMSNorm(dim, norm_eps, norm_elementwise_affine) + else: + self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine) + + self.attn2 = Attention( + query_dim=dim, + cross_attention_dim=cross_attention_dim if not double_self_attention else None, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + out_bias=attention_out_bias, + ) # is self-attn if encoder_hidden_states is none + else: + self.norm2 = None + self.attn2 = None + + # 3. Feed-forward + if norm_type == "ada_norm_continuous": + self.norm3 = AdaLayerNormContinuous( + dim, + ada_norm_continous_conditioning_embedding_dim, + norm_elementwise_affine, + norm_eps, + ada_norm_bias, + "layer_norm", + ) + + elif norm_type in ["ada_norm_zero", "ada_norm", "layer_norm", "ada_norm_continuous"]: + if use_rms_norm: + self.norm3 = RMSNorm(dim, norm_eps, norm_elementwise_affine) + else: + self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine) + elif norm_type == "layer_norm_i2vgen": + self.norm3 = None + + self.ff = FeedForward( + dim, + dropout=dropout, + activation_fn=activation_fn, + final_dropout=final_dropout, + inner_dim=ff_inner_dim, + bias=ff_bias, + ) + + # 4. Fuser + if attention_type == "gated" or attention_type == "gated-text-image": + self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim) + + # 5. Scale-shift for PixArt-Alpha. 
+ if norm_type == "ada_norm_single": + self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5) + + # let chunk size default to None + self._chunk_size = None + self._chunk_dim = 0 + + def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0): + # Sets chunk feed-forward + self._chunk_size = chunk_size + self._chunk_dim = dim + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + timestep: Optional[torch.LongTensor] = None, + cross_attention_kwargs: Dict[str, Any] = None, + class_labels: Optional[torch.LongTensor] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + ) -> torch.FloatTensor: + if cross_attention_kwargs is not None: + if cross_attention_kwargs.get("scale", None) is not None: + logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.") + + # Notice that normalization is always applied before the real computation in the following blocks. + # 0. 
Self-Attention + batch_size = hidden_states.shape[0] + + if self.norm_type == "ada_norm": + norm_hidden_states = self.norm1(hidden_states, timestep) + elif self.norm_type == "ada_norm_zero": + norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( + hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype + ) + elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]: + norm_hidden_states = self.norm1(hidden_states) + elif self.norm_type == "ada_norm_continuous": + norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"]) + elif self.norm_type == "ada_norm_single": + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( + self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1) + ).chunk(6, dim=1) + norm_hidden_states = self.norm1(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa + norm_hidden_states = norm_hidden_states.squeeze(1) + else: + raise ValueError("Incorrect norm used") + + if self.pos_embed is not None: + norm_hidden_states = self.pos_embed(norm_hidden_states) + + # 1. Prepare GLIGEN inputs + cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {} + gligen_kwargs = cross_attention_kwargs.pop("gligen", None) + + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + if self.norm_type == "ada_norm_zero": + attn_output = gate_msa.unsqueeze(1) * attn_output + elif self.norm_type == "ada_norm_single": + attn_output = gate_msa * attn_output + + hidden_states = attn_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + # 1.2 GLIGEN Control + if gligen_kwargs is not None: + hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"]) + + # 3. 
Cross-Attention + if self.attn2 is not None: + if self.norm_type == "ada_norm": + norm_hidden_states = self.norm2(hidden_states, timestep) + elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]: + norm_hidden_states = self.norm2(hidden_states) + elif self.norm_type == "ada_norm_single": + # For PixArt norm2 isn't applied here: + # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103 + norm_hidden_states = hidden_states + elif self.norm_type == "ada_norm_continuous": + norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"]) + else: + raise ValueError("Incorrect norm") + + if self.pos_embed is not None and self.norm_type != "ada_norm_single": + norm_hidden_states = self.pos_embed(norm_hidden_states) + + attn_output = self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + **cross_attention_kwargs, + ) + hidden_states = attn_output + hidden_states + + # 4. 
@maybe_allow_in_graph
class TemporalBasicTransformerBlock(nn.Module):
    r"""
    A Transformer block that mixes information along the frame axis of video like data.

    The input arrives as `(batch * frames, seq, channels)`. `forward` regroups it so that every sequence
    position becomes its own batch entry of `num_frames` tokens, applies an input feed-forward,
    self-attention, optional cross-attention and an output feed-forward, and finally restores the incoming
    layout.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        time_mix_inner_dim (`int`): The number of channels for temporal attention.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
            When `None`, no cross-attention layer is created.
    """

    def __init__(
        self,
        dim: int,
        time_mix_inner_dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        cross_attention_dim: Optional[int] = None,
    ):
        super().__init__()
        # Residual shortcuts around the feed-forwards are only taken when both widths agree.
        self.is_res = dim == time_mix_inner_dim

        # Input projection into the temporal width.
        self.norm_in = nn.LayerNorm(dim)
        self.ff_in = FeedForward(dim, dim_out=time_mix_inner_dim, activation_fn="geglu")

        # Self-attention over the frame axis.
        self.norm1 = nn.LayerNorm(time_mix_inner_dim)
        self.attn1 = Attention(
            query_dim=time_mix_inner_dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            cross_attention_dim=None,
        )

        # Optional cross-attention against `encoder_hidden_states`.
        has_cross_attention = cross_attention_dim is not None
        self.norm2 = nn.LayerNorm(time_mix_inner_dim) if has_cross_attention else None
        self.attn2 = (
            Attention(
                query_dim=time_mix_inner_dim,
                cross_attention_dim=cross_attention_dim,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
            )
            if has_cross_attention
            else None
        )

        # Output feed-forward.
        self.norm3 = nn.LayerNorm(time_mix_inner_dim)
        self.ff = FeedForward(time_mix_inner_dim, activation_fn="geglu")

        # Chunked feed-forward is disabled until `set_chunk_feed_forward` is called.
        self._chunk_size = None
        self._chunk_dim = None

    def set_chunk_feed_forward(self, chunk_size: Optional[int], **kwargs):
        """Enable (or, with `None`, disable) chunked feed-forward; extra keyword arguments are ignored."""
        self._chunk_size = chunk_size
        # The chunk dimension is pinned to 1 for a better speed vs. memory trade-off.
        self._chunk_dim = 1

    def _apply_feed_forward(self, ff: nn.Module, states: torch.Tensor) -> torch.Tensor:
        """Run `ff` on `states`, slice by slice when a chunk size has been configured."""
        if self._chunk_size is None:
            return ff(states)
        return _chunked_feed_forward(ff, states, self._chunk_dim, self._chunk_size)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        num_frames: int,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
    ) -> torch.FloatTensor:
        # Normalization is always applied before the real computation in each of the stages below.
        batch_frames, seq_length, channels = hidden_states.shape
        batch_size = batch_frames // num_frames

        # (batch * frames, seq, ch) -> (batch, frames, seq, ch) -> (batch, seq, frames, ch)
        # -> (batch * seq, frames, ch)
        hidden_states = (
            hidden_states[None, :]
            .reshape(batch_size, num_frames, seq_length, channels)
            .permute(0, 2, 1, 3)
            .reshape(batch_size * seq_length, num_frames, channels)
        )

        # Input feed-forward.
        shortcut = hidden_states
        hidden_states = self._apply_feed_forward(self.ff_in, self.norm_in(hidden_states))
        if self.is_res:
            hidden_states = hidden_states + shortcut

        # Self-attention.
        hidden_states = self.attn1(self.norm1(hidden_states), encoder_hidden_states=None) + hidden_states

        # Cross-attention.
        if self.attn2 is not None:
            cross_output = self.attn2(self.norm2(hidden_states), encoder_hidden_states=encoder_hidden_states)
            hidden_states = cross_output + hidden_states

        # Output feed-forward.
        ff_output = self._apply_feed_forward(self.ff, self.norm3(hidden_states))
        hidden_states = ff_output + hidden_states if self.is_res else ff_output

        # Undo the regrouping: (batch * seq, frames, ch) -> (batch * frames, seq, ch)
        return (
            hidden_states[None, :]
            .reshape(batch_size, seq_length, num_frames, channels)
            .permute(0, 2, 1, 3)
            .reshape(batch_size * num_frames, seq_length, channels)
        )
class FeedForward(nn.Module):
    r"""
    A feed-forward layer.

    Parameters:
        dim (`int`): The number of channels in the input.
        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
            One of `"gelu"`, `"gelu-approximate"`, `"geglu"` or `"geglu-approximate"`.
        final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
        inner_dim (`int`, *optional*): The hidden dimension. If not given, defaults to `int(dim * mult)`.
        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.

    Raises:
        ValueError: If `activation_fn` is not one of the supported names.
    """

    def __init__(
        self,
        dim: int,
        dim_out: Optional[int] = None,
        mult: int = 4,
        dropout: float = 0.0,
        activation_fn: str = "geglu",
        final_dropout: bool = False,
        inner_dim=None,
        bias: bool = True,
    ):
        super().__init__()
        if inner_dim is None:
            inner_dim = int(dim * mult)
        dim_out = dim_out if dim_out is not None else dim

        # A single if/elif chain with an explicit fallback: an unsupported name used to surface as an
        # UnboundLocalError on `act_fn` further down instead of a readable error.
        if activation_fn == "gelu":
            act_fn = GELU(dim, inner_dim, bias=bias)
        elif activation_fn == "gelu-approximate":
            act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
        elif activation_fn == "geglu":
            act_fn = GEGLU(dim, inner_dim, bias=bias)
        elif activation_fn == "geglu-approximate":
            act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
        else:
            raise ValueError(
                f"Unsupported `activation_fn`: {activation_fn!r}. Expected one of 'gelu', 'gelu-approximate',"
                " 'geglu' or 'geglu-approximate'."
            )

        # Layer order is kept stable because it determines the `net.<index>` state-dict keys:
        # project in (with activation), dropout, project out, optional final dropout.
        layers = [act_fn, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out, bias=bias)]
        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
        if final_dropout:
            layers.append(nn.Dropout(dropout))
        self.net = nn.ModuleList(layers)

    def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        """Pass `hidden_states` through every layer of `self.net` in order.

        Extra positional arguments and a `scale` keyword are accepted for backward compatibility only;
        they trigger a deprecation notice and are otherwise ignored.
        """
        if len(args) > 0 or kwargs.get("scale", None) is not None:
            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
            deprecate("scale", "1.0.0", deprecation_message)
        for module in self.net:
            hidden_states = module(hidden_states)
        return hidden_states
+ """ + + def __init__(self, embedding_dim: int, num_embeddings: Optional[int] = None): + super().__init__() + self.emb = None + self.silu = nn.SiLU() + self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True) + self.norm = RMSNorm(embedding_dim, eps=1e-6, elementwise_affine=False) + + def forward( + self, + x: torch.Tensor, + timestep: Optional[torch.Tensor] = None, + class_labels: Optional[torch.LongTensor] = None, + hidden_dtype: Optional[torch.dtype] = None, + emb: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + if self.emb is not None: + emb = self.emb(timestep, class_labels, hidden_dtype=hidden_dtype) + emb = self.linear(self.silu(emb)) + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=1) + x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None] + return x, gate_msa, shift_mlp, scale_mlp, gate_mlp + + +@maybe_allow_in_graph +class JointTransformerBlock(nn.Module): + r""" + A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3. + + Reference: https://arxiv.org/abs/2403.03206 + + Parameters: + dim (`int`): The number of channels in the input and output. + num_attention_heads (`int`): The number of heads to use for multi-head attention. + attention_head_dim (`int`): The number of channels in each head. + context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the + processing of `context` conditions. 
+ """ + + def __init__(self, dim, num_attention_heads, attention_head_dim, context_pre_only=False, use_mmdit=True, cross_attention_dim=None): + super().__init__() + + self.context_pre_only = context_pre_only + context_norm_type = "ada_norm_continous" if context_pre_only else "ada_norm_zero" + + self.norm1 = AdaRMSNormZero(dim) + + if context_norm_type == "ada_norm_continous" and use_mmdit: + self.norm1_context = AdaLayerNormContinuous( + dim, dim, elementwise_affine=False, eps=1e-6, bias=True, norm_type="rms_norm" + ) + elif context_norm_type == "ada_norm_zero" and use_mmdit: + self.norm1_context = AdaRMSNormZero(dim) + + if hasattr(F, "scaled_dot_product_attention"): + processor = CustomJointAttnProcessor2_0() + else: + raise ValueError( + "The current PyTorch version does not support the `scaled_dot_product_attention` function." + ) + self.attn = Attention( + query_dim=dim, + cross_attention_dim=cross_attention_dim, + added_kv_proj_dim=dim if use_mmdit else None, + dim_head=attention_head_dim // num_attention_heads, + heads=num_attention_heads, + out_dim=attention_head_dim, + context_pre_only=context_pre_only, + bias=True, + qk_norm="rms_norm", + processor=processor, + ) + + self.norm2 = RMSNorm(dim, 1e-06, elementwise_affine=False) + self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate") + + if not context_pre_only: + self.norm2_context = RMSNorm(dim, 1e-06, elementwise_affine=False) + self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate") + else: + self.norm2_context = None + self.ff_context = None + + # let chunk size default to None + self._chunk_size = None + self._chunk_dim = 0 + + # Copied from diffusers.models.attention.BasicTransformerBlock.set_chunk_feed_forward + def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0): + # Sets chunk feed-forward + self._chunk_size = chunk_size + self._chunk_dim = dim + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: 
torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor = None, + rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None, + rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None, + ): + norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb) + + norm_encoder_hidden_states = None + if encoder_hidden_states is not None: + if self.context_pre_only: + norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states, temb) + else: + norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context( + encoder_hidden_states, emb=temb + ) + + # Attention. + attn_output, context_attn_output = self.attn( + hidden_states=norm_hidden_states, + encoder_hidden_states=norm_encoder_hidden_states, + rotary_freqs_cis=rotary_freqs_cis, + rotary_freqs_cis_cross=rotary_freqs_cis_cross, + ) + + # Process attention outputs for the `hidden_states`. + attn_output = gate_msa.unsqueeze(1) * attn_output + hidden_states = hidden_states + attn_output + + norm_hidden_states = self.norm2(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + if self._chunk_size is not None: + # "feed_forward_chunk_size" can be used to save memory + ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size) + else: + ff_output = self.ff(norm_hidden_states) + ff_output = gate_mlp.unsqueeze(1) * ff_output + + hidden_states = hidden_states + ff_output + + # Process attention outputs for the `encoder_hidden_states`. 
+ if self.context_pre_only or encoder_hidden_states is None: + encoder_hidden_states = None + if not self.context_pre_only and encoder_hidden_states is not None: + context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output + encoder_hidden_states = encoder_hidden_states + context_attn_output + + norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states) + norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None] + if self._chunk_size is not None: + # "feed_forward_chunk_size" can be used to save memory + context_ff_output = _chunked_feed_forward( + self.ff_context, norm_encoder_hidden_states, self._chunk_dim, self._chunk_size + ) + else: + context_ff_output = self.ff_context(norm_encoder_hidden_states) + encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output + + return encoder_hidden_states, hidden_states + + +def val2list(x: list or tuple or any, repeat_time=1) -> list: # type: ignore + """Repeat `val` for `repeat_time` times and return the list or val if list/tuple.""" + if isinstance(x, (list, tuple)): + return list(x) + return [x for _ in range(repeat_time)] + + +def val2tuple(x: list or tuple or any, min_len: int = 1, idx_repeat: int = -1) -> tuple: # type: ignore + """Return tuple with min_len by repeating element at idx_repeat.""" + # convert to list first + x = val2list(x) + + # repeat elements if necessary + if len(x) > 0: + x[idx_repeat:idx_repeat] = [x[idx_repeat] for _ in range(min_len - len(x))] + + return tuple(x) + + +def get_same_padding(kernel_size: Union[int, Tuple[int, ...]]) -> Union[int, Tuple[int, ...]]: + if isinstance(kernel_size, tuple): + return tuple([get_same_padding(ks) for ks in kernel_size]) + else: + assert kernel_size % 2 > 0, f"kernel size {kernel_size} should be odd number" + return kernel_size // 2 + + +class ConvLayer(nn.Module): + def __init__( + self, + in_dim: int, + out_dim: int, + kernel_size=3, + stride=1, + 
dilation=1, + groups=1, + padding: Union[int, None] = None, + use_bias=False, + norm=None, + act=None, + ): + super().__init__() + if padding is None: + padding = get_same_padding(kernel_size) + padding *= dilation + + self.in_dim = in_dim + self.out_dim = out_dim + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.groups = groups + self.padding = padding + self.use_bias = use_bias + + self.conv = nn.Conv1d( + in_dim, + out_dim, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=use_bias, + ) + if norm is not None: + self.norm = RMSNorm(out_dim, elementwise_affine=False) + else: + self.norm = None + if act is not None: + self.act = nn.SiLU(inplace=True) + else: + self.act = None + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv(x) + if self.norm: + x = self.norm(x) + if self.act: + x = self.act(x) + return x + + +class GLUMBConv(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: int, + out_feature=None, + kernel_size=3, + stride=1, + padding: Union[int, None] = None, + use_bias=False, + norm=(None, None, None), + act=("silu", "silu", None), + dilation=1, + ): + out_feature = out_feature or in_features + super().__init__() + use_bias = val2tuple(use_bias, 3) + norm = val2tuple(norm, 3) + act = val2tuple(act, 3) + + self.glu_act = nn.SiLU(inplace=False) + self.inverted_conv = ConvLayer( + in_features, + hidden_features * 2, + 1, + use_bias=use_bias[0], + norm=norm[0], + act=act[0], + ) + self.depth_conv = ConvLayer( + hidden_features * 2, + hidden_features * 2, + kernel_size, + stride=stride, + groups=hidden_features * 2, + padding=padding, + use_bias=use_bias[1], + norm=norm[1], + act=None, + dilation=dilation, + ) + self.point_conv = ConvLayer( + hidden_features, + out_feature, + 1, + use_bias=use_bias[2], + norm=norm[2], + act=act[2], + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.transpose(1, 2) + x = 
self.inverted_conv(x) + x = self.depth_conv(x) + + x, gate = torch.chunk(x, 2, dim=1) + gate = self.glu_act(gate) + x = x * gate + + x = self.point_conv(x) + x = x.transpose(1, 2) + + return x + + +def t2i_modulate(x, shift, scale): + return x * (1 + scale) + shift + + +class LinearTransformerBlock(nn.Module): + """ + A Sana block with global shared adaptive layer norm (adaLN-single) conditioning. + """ + def __init__( + self, + dim, + num_attention_heads, + attention_head_dim, + use_adaln_single=True, + cross_attention_dim=None, + added_kv_proj_dim=None, + context_pre_only=False, + mlp_ratio=4.0, + add_cross_attention=False, + add_cross_attention_dim=None, + qk_norm=None, + ): + super().__init__() + + self.norm1 = RMSNorm(dim, elementwise_affine=False, eps=1e-6) + self.attn = Attention( + query_dim=dim, + cross_attention_dim=cross_attention_dim, + added_kv_proj_dim=added_kv_proj_dim, + dim_head=attention_head_dim, + heads=num_attention_heads, + out_dim=dim, + bias=True, + qk_norm=qk_norm, + processor=CustomLiteLAProcessor2_0(), + ) + + self.add_cross_attention = add_cross_attention + self.context_pre_only = context_pre_only + + if add_cross_attention and add_cross_attention_dim is not None: + self.cross_attn = Attention( + query_dim=dim, + cross_attention_dim=add_cross_attention_dim, + added_kv_proj_dim=add_cross_attention_dim, + dim_head=attention_head_dim, + heads=num_attention_heads, + out_dim=dim, + context_pre_only=context_pre_only, + bias=True, + qk_norm=qk_norm, + processor=CustomerAttnProcessor2_0(), + ) + + self.norm2 = RMSNorm(dim, 1e-06, elementwise_affine=False) + + self.ff = GLUMBConv( + in_features=dim, + hidden_features=int(dim * mlp_ratio), + use_bias=(True, True, False), + norm=(None, None, None), + act=("silu", "silu", None), + ) + self.use_adaln_single = use_adaln_single + if use_adaln_single: + self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5) + + def forward( + self, + hidden_states: torch.FloatTensor, + 
encoder_hidden_states: torch.FloatTensor = None, + attention_mask: torch.FloatTensor = None, + encoder_attention_mask: torch.FloatTensor = None, + rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None, + rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None, + temb: torch.FloatTensor = None, + ): + + N = hidden_states.shape[0] + + # step 1: AdaLN single + if self.use_adaln_single: + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( + self.scale_shift_table[None] + temb.reshape(N, 6, -1) + ).chunk(6, dim=1) + + norm_hidden_states = self.norm1(hidden_states) + if self.use_adaln_single: + norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa + + # step 2: attention + if not self.add_cross_attention: + attn_output, encoder_hidden_states = self.attn( + hidden_states=norm_hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + rotary_freqs_cis=rotary_freqs_cis, + rotary_freqs_cis_cross=rotary_freqs_cis_cross, + ) + else: + attn_output, _ = self.attn( + hidden_states=norm_hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=None, + encoder_attention_mask=None, + rotary_freqs_cis=rotary_freqs_cis, + rotary_freqs_cis_cross=None, + ) + + if self.use_adaln_single: + attn_output = gate_msa * attn_output + hidden_states = attn_output + hidden_states + + if self.add_cross_attention: + attn_output = self.cross_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + rotary_freqs_cis=rotary_freqs_cis, + rotary_freqs_cis_cross=rotary_freqs_cis_cross, + ) + hidden_states = attn_output + hidden_states + + # step 3: add norm + norm_hidden_states = self.norm2(hidden_states) + if self.use_adaln_single: + norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp + + # step 4: feed forward + 
class LinearMMDiTBlock(nn.Module):
    """
    Transformer block with two operating modes selected by ``is_single``.

    * ``is_single=True``: a single-stream block. The input is normalized by
      ``AdaLayerNormZeroSingle``, fed in parallel to attention and to an MLP
      projection, and the two results are concatenated, projected back to
      ``dim``, gated and added to the residual.
    * ``is_single=False``: a dual-stream block. ``hidden_states`` and
      ``encoder_hidden_states`` each get their own learned ``scale_shift_table``
      (combined with ``temb``), share one joint attention call, and then go
      through separate ``GLUMBConv`` feed-forward layers.
    """

    def __init__(
        self,
        dim,
        num_attention_heads,
        attention_head_dim,
        mlp_ratio=4.0,
        is_single=True,
    ):
        super().__init__()
        self.is_single = is_single

        if is_single:
            self.mlp_hidden_dim = int(dim * mlp_ratio)
            self.norm = AdaLayerNormZeroSingle(dim)
            self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
            self.act_mlp = nn.GELU(approximate="tanh")
            # proj_out consumes the concatenation of the attention output and the MLP branch.
            self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
            processor = CustomLiteLAMMDiTProcessor2_0()
            self.attn = Attention(
                query_dim=dim,
                cross_attention_dim=None,
                dim_head=attention_head_dim,
                heads=num_attention_heads,
                out_dim=dim,
                bias=True,
                processor=processor,
                qk_norm="rms_norm",
                eps=1e-6,
                pre_only=True,
            )
        else:
            # One (6, dim) modulation table per stream: shift/scale/gate for attention and for the MLP.
            self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
            self.scale_shift_table_context = nn.Parameter(torch.randn(6, dim) / dim**0.5)
            self.norm1 = RMSNorm(dim, elementwise_affine=False, eps=1e-6)
            self.norm1_context = RMSNorm(dim, elementwise_affine=False, eps=1e-6)
            processor = CustomLiteLAMMDiTProcessor2_0()
            self.attn = Attention(
                query_dim=dim,
                cross_attention_dim=None,
                added_kv_proj_dim=dim,
                dim_head=attention_head_dim,
                heads=num_attention_heads,
                out_dim=dim,
                context_pre_only=False,
                bias=True,
                processor=processor,
                qk_norm="rms_norm",
            )

            self.norm2 = RMSNorm(dim, 1e-06, elementwise_affine=False)
            self.norm2_context = RMSNorm(dim, 1e-06, elementwise_affine=False)
            self.ff = GLUMBConv(
                in_features=dim,
                hidden_features=int(dim * mlp_ratio),
                use_bias=(True, True, False),
                norm=(None, None, None),
                act=("silu", "silu", None),
            )
            self.ff_context = GLUMBConv(
                in_features=dim,
                hidden_features=int(dim * mlp_ratio),
                use_bias=(True, True, False),
                norm=(None, None, None),
                act=("silu", "silu", None),
            )

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor = None,
        temb: torch.FloatTensor = None,
        image_rotary_emb: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
        joint_attention_kwargs: Union[dict, None] = None,
    ):
        """Run the block.

        Args:
            hidden_states: Main-stream activations.
            encoder_hidden_states: Context-stream activations; only used when
                ``is_single`` is ``False``.
            temb: Conditioning embedding. In single mode it is passed to
                ``self.norm`` as ``emb``; in dual mode it is reshaped to
                ``(N, 6, -1)`` and added to the learned modulation tables.
            image_rotary_emb: Rotary embedding forwarded to the attention layer.
            joint_attention_kwargs: Optional extra keyword arguments forwarded
                to the attention call in single mode. ``None`` means no extras.

        Returns:
            ``hidden_states`` in single mode, or the tuple
            ``(encoder_hidden_states, hidden_states)`` in dual mode.
        """
        if self.is_single:
            residual = hidden_states
            norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
            mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
            # `joint_attention_kwargs` used to be read here without ever being
            # defined, which raised UnboundLocalError on every single-mode call.
            joint_attention_kwargs = joint_attention_kwargs or {}
            attn_output = self.attn(
                hidden_states=norm_hidden_states,
                image_rotary_emb=image_rotary_emb,
                **joint_attention_kwargs,
            )
            hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
            gate = gate.unsqueeze(1)
            hidden_states = gate * self.proj_out(hidden_states)
            hidden_states = residual + hidden_states
            if hidden_states.dtype == torch.float16:
                # Keep values inside the finite float16 range.
                hidden_states = hidden_states.clip(-65504, 65504)

            return hidden_states
        else:
            N = hidden_states.shape[0]

            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
                self.scale_shift_table[None] + temb.reshape(N, 6, -1)
            ).chunk(6, dim=1)

            norm_hidden_states = self.norm1(hidden_states)
            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa

            shift_msa_context, scale_msa_context, gate_msa_context, shift_mlp_context, scale_mlp_context, gate_mlp_context = (
                self.scale_shift_table_context[None] + temb.reshape(N, 6, -1)
            ).chunk(6, dim=1)

            norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states)
            norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + scale_msa_context) + shift_msa_context

            # Attention.
            attn_output, context_attn_output = self.attn(
                hidden_states=norm_hidden_states,
                encoder_hidden_states=norm_encoder_hidden_states,
                image_rotary_emb=image_rotary_emb,
            )

            # Process attention outputs for the `hidden_states`.
            attn_output = gate_msa * attn_output
            hidden_states = attn_output + hidden_states

            norm_hidden_states = self.norm2(hidden_states)
            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp

            ff_output = self.ff(norm_hidden_states)
            ff_output = gate_mlp * ff_output
            hidden_states = hidden_states + ff_output

            # Process attention outputs for the `encoder_hidden_states`.
            context_attn_output = gate_msa_context * context_attn_output
            encoder_hidden_states = encoder_hidden_states + context_attn_output

            norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
            norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + scale_mlp_context) + shift_mlp_context

            context_ff_output = self.ff_context(norm_encoder_hidden_states)
            context_ff_output = gate_mlp_context * context_ff_output
            encoder_hidden_states = encoder_hidden_states + context_ff_output
            return encoder_hidden_states, hidden_states
class RMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension.

    Args:
        dim: Size of the normalized dimension (an integer or a shape).
        eps: Value added to the mean square before taking the reciprocal
            square root.
        elementwise_affine: When ``True`` a learnable per-element ``weight``
            (initialized to ones) multiplies the normalized output.
    """

    def __init__(self, dim, eps: float, elementwise_affine: bool = True):
        super().__init__()

        self.eps = eps

        # Accept a bare integer as shorthand for a one-element shape.
        shape = (dim,) if isinstance(dim, numbers.Integral) else dim
        self.dim = torch.Size(shape)

        self.weight = nn.Parameter(torch.ones(shape)) if elementwise_affine else None

    def forward(self, hidden_states):
        """Normalize ``hidden_states`` by the RMS of its last dimension."""
        orig_dtype = hidden_states.dtype
        # The mean square is accumulated in float32 regardless of the input dtype.
        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        normed = hidden_states * torch.rsqrt(mean_square + self.eps)

        if self.weight is None:
            # No affine weight: cast straight back to the caller's dtype.
            return normed.to(orig_dtype)

        # convert into half-precision if necessary
        if self.weight.dtype in (torch.float16, torch.bfloat16):
            normed = normed.to(self.weight.dtype)
        return normed * self.weight
+ norm_num_groups (`int`, *optional*, defaults to `None`): + The number of groups to use for the group norm in the attention. + spatial_norm_dim (`int`, *optional*, defaults to `None`): + The number of channels to use for the spatial normalization. + out_bias (`bool`, *optional*, defaults to `True`): + Set to `True` to use a bias in the output linear layer. + scale_qk (`bool`, *optional*, defaults to `True`): + Set to `True` to scale the query and key by `1 / sqrt(dim_head)`. + only_cross_attention (`bool`, *optional*, defaults to `False`): + Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if + `added_kv_proj_dim` is not `None`. + eps (`float`, *optional*, defaults to 1e-5): + An additional value added to the denominator in group normalization that is used for numerical stability. + rescale_output_factor (`float`, *optional*, defaults to 1.0): + A factor to rescale the output by dividing it with this value. + residual_connection (`bool`, *optional*, defaults to `False`): + Set to `True` to add the residual connection to the output. + _from_deprecated_attn_block (`bool`, *optional*, defaults to `False`): + Set to `True` if the attention block is loaded from a deprecated state dict. + processor (`AttnProcessor`, *optional*, defaults to `None`): + The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used and + `AttnProcessor` otherwise. 
+ """ + + def __init__( + self, + query_dim: int, + cross_attention_dim: Optional[int] = None, + heads: int = 8, + dim_head: int = 64, + dropout: float = 0.0, + bias: bool = False, + upcast_attention: bool = False, + upcast_softmax: bool = False, + cross_attention_norm: Optional[str] = None, + cross_attention_norm_num_groups: int = 32, + qk_norm: Optional[str] = None, # [layer_norm, group_norm, rms_norm] + added_kv_proj_dim: Optional[int] = None, + norm_num_groups: Optional[int] = None, + spatial_norm_dim: Optional[int] = None, + out_bias: bool = True, + scale_qk: bool = True, + only_cross_attention: bool = False, + eps: float = 1e-5, + rescale_output_factor: float = 1.0, + residual_connection: bool = False, + _from_deprecated_attn_block: bool = False, + processor: Optional["AttnProcessor"] = None, + out_dim: int = None, + context_pre_only=None, + ): + super().__init__() + self.inner_dim = out_dim if out_dim is not None else dim_head * heads + self.query_dim = query_dim + self.use_bias = bias + self.is_cross_attention = cross_attention_dim is not None + self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim + self.upcast_attention = upcast_attention + self.upcast_softmax = upcast_softmax + self.rescale_output_factor = rescale_output_factor + self.residual_connection = residual_connection + self.dropout = dropout + self.fused_projections = False + self.out_dim = out_dim if out_dim is not None else query_dim + self.context_pre_only = context_pre_only + + # we make use of this private variable to know whether this class is loaded + # with an deprecated state dict so that we can convert it on the fly + self._from_deprecated_attn_block = _from_deprecated_attn_block + + self.scale_qk = scale_qk + self.scale = dim_head**-0.5 if self.scale_qk else 1.0 + + self.heads = out_dim // dim_head if out_dim is not None else heads + # for slice_size > 0 the attention score computation + # is split across the batch axis to save memory + # You 
can set slice_size with `set_attention_slice` + self.sliceable_head_dim = heads + + self.added_kv_proj_dim = added_kv_proj_dim + self.only_cross_attention = only_cross_attention + + if self.added_kv_proj_dim is None and self.only_cross_attention: + raise ValueError( + "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`." + ) + + if norm_num_groups is not None: + self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True) + else: + self.group_norm = None + + if spatial_norm_dim is not None: + self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim) + else: + self.spatial_norm = None + + if qk_norm is None: + self.norm_q = None + self.norm_k = None + elif qk_norm == "layer_norm": + self.norm_q = nn.LayerNorm(dim_head, eps=eps) + self.norm_k = nn.LayerNorm(dim_head, eps=eps) + elif qk_norm == "rms_norm": + self.norm_q = RMSNorm(dim_head, eps=eps) + self.norm_k = RMSNorm(dim_head, eps=eps) + else: + raise ValueError(f"unknown qk_norm: {qk_norm}. Should be None or 'layer_norm' or 'rsm_norm'") + + if cross_attention_norm is None: + self.norm_cross = None + elif cross_attention_norm == "layer_norm": + self.norm_cross = nn.LayerNorm(self.cross_attention_dim) + elif cross_attention_norm == "group_norm": + if self.added_kv_proj_dim is not None: + # The given `encoder_hidden_states` are initially of shape + # (batch_size, seq_len, added_kv_proj_dim) before being projected + # to (batch_size, seq_len, cross_attention_dim). The norm is applied + # before the projection, so we need to use `added_kv_proj_dim` as + # the number of channels for the group norm. 
+ norm_cross_num_channels = added_kv_proj_dim + else: + norm_cross_num_channels = self.cross_attention_dim + + self.norm_cross = nn.GroupNorm( + num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True + ) + else: + raise ValueError( + f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'" + ) + + self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias) + + if not self.only_cross_attention: + # only relevant for the `AddedKVProcessor` classes + self.to_k = nn.Linear(self.cross_attention_dim, self.inner_dim, bias=bias) + self.to_v = nn.Linear(self.cross_attention_dim, self.inner_dim, bias=bias) + else: + self.to_k = None + self.to_v = None + + if self.added_kv_proj_dim is not None: + self.add_k_proj = nn.Linear(added_kv_proj_dim, self.inner_dim) + self.add_v_proj = nn.Linear(added_kv_proj_dim, self.inner_dim) + if self.context_pre_only is not None: + self.add_q_proj = nn.Linear(added_kv_proj_dim, self.inner_dim) + + self.to_out = nn.ModuleList([]) + self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=out_bias)) + self.to_out.append(nn.Dropout(dropout)) + + if self.context_pre_only is not None and not self.context_pre_only and self.added_kv_proj_dim is not None: + self.to_add_out = nn.Linear(self.inner_dim, self.out_dim, bias=out_bias) + + if qk_norm is not None and added_kv_proj_dim is not None: + if qk_norm == "fp32_layer_norm": + self.norm_added_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps) + self.norm_added_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps) + elif qk_norm == "rms_norm": + self.norm_added_q = RMSNorm(dim_head, eps=eps) + self.norm_added_k = RMSNorm(dim_head, eps=eps) + else: + raise ValueError( + f"unknown qk_norm: {qk_norm}. 
def set_use_memory_efficient_attention_xformers(
    self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
) -> None:
    r"""
    Set whether to use memory efficient attention from `xformers` or not.

    Args:
        use_memory_efficient_attention_xformers (`bool`):
            Whether to use memory efficient attention from `xformers` or not.
        attention_op (`Callable`, *optional*):
            The attention operation to use. Defaults to `None` which uses the default attention operation from
            `xformers`.

    Raises:
        NotImplementedError: If xformers is requested while the current processor is an added-KV
            processor that is also a LoRA or custom-diffusion processor.
        ModuleNotFoundError: If xformers is requested but not installed.
        ValueError: If xformers is requested but CUDA is not available.
    """
    has_processor = hasattr(self, "processor")
    # `is_lora` used to be commented out while still being read below, which raised
    # NameError. It is rebuilt here from the LoRA processor classes this module imports.
    is_lora = has_processor and isinstance(
        self.processor,
        (LoRAAttnProcessor, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor, LoRAAttnAddedKVProcessor),
    )
    is_custom_diffusion = has_processor and isinstance(
        self.processor,
        (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor, CustomDiffusionAttnProcessor2_0),
    )
    is_added_kv_processor = has_processor and isinstance(
        self.processor,
        (
            AttnAddedKVProcessor,
            AttnAddedKVProcessor2_0,
            SlicedAttnAddedKVProcessor,
            XFormersAttnAddedKVProcessor,
            LoRAAttnAddedKVProcessor,
        ),
    )

    if use_memory_efficient_attention_xformers:
        if is_added_kv_processor and (is_lora or is_custom_diffusion):
            raise NotImplementedError(
                f"Memory efficient attention is currently not supported for LoRA or custom diffusion for attention processor type {self.processor}"
            )
        if not is_xformers_available():
            raise ModuleNotFoundError(
                (
                    "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
                    " xformers"
                ),
                name="xformers",
            )
        elif not torch.cuda.is_available():
            raise ValueError(
                "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is"
                " only available for GPU "
            )
        else:
            # Make sure we can run the memory efficient attention; any failure propagates to the caller.
            _ = xformers.ops.memory_efficient_attention(
                torch.randn((1, 2, 40), device="cuda"),
                torch.randn((1, 2, 40), device="cuda"),
                torch.randn((1, 2, 40), device="cuda"),
            )

        if is_lora:
            # TODO (sayakpaul): should we throw a warning if someone wants to use the xformers
            # variant when using PT 2.0 now that we have LoRAAttnProcessor2_0?
            processor = LoRAXFormersAttnProcessor(
                hidden_size=self.processor.hidden_size,
                cross_attention_dim=self.processor.cross_attention_dim,
                rank=self.processor.rank,
                attention_op=attention_op,
            )
            processor.load_state_dict(self.processor.state_dict())
            processor.to(self.processor.to_q_lora.up.weight.device)
        elif is_custom_diffusion:
            processor = CustomDiffusionXFormersAttnProcessor(
                train_kv=self.processor.train_kv,
                train_q_out=self.processor.train_q_out,
                hidden_size=self.processor.hidden_size,
                cross_attention_dim=self.processor.cross_attention_dim,
                attention_op=attention_op,
            )
            processor.load_state_dict(self.processor.state_dict())
            if hasattr(self.processor, "to_k_custom_diffusion"):
                processor.to(self.processor.to_k_custom_diffusion.weight.device)
        elif is_added_kv_processor:
            # TODO(Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP
            # which uses this type of cross attention ONLY because the attention mask of format
            # [0, ..., -10.000, ..., 0, ...,] is not supported
            # throw warning
            logger.info(
                "Memory efficient attention with `xformers` might currently not work correctly if an attention mask is required for the attention operation."
            )
            processor = XFormersAttnAddedKVProcessor(attention_op=attention_op)
        else:
            processor = XFormersAttnProcessor(attention_op=attention_op)
    else:
        if is_lora:
            attn_processor_class = (
                LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor
            )
            processor = attn_processor_class(
                hidden_size=self.processor.hidden_size,
                cross_attention_dim=self.processor.cross_attention_dim,
                rank=self.processor.rank,
            )
            processor.load_state_dict(self.processor.state_dict())
            processor.to(self.processor.to_q_lora.up.weight.device)
        elif is_custom_diffusion:
            attn_processor_class = (
                CustomDiffusionAttnProcessor2_0
                if hasattr(F, "scaled_dot_product_attention")
                else CustomDiffusionAttnProcessor
            )
            processor = attn_processor_class(
                train_kv=self.processor.train_kv,
                train_q_out=self.processor.train_q_out,
                hidden_size=self.processor.hidden_size,
                cross_attention_dim=self.processor.cross_attention_dim,
            )
            processor.load_state_dict(self.processor.state_dict())
            if hasattr(self.processor, "to_k_custom_diffusion"):
                processor.to(self.processor.to_k_custom_diffusion.weight.device)
        else:
            # set attention processor
            # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
            # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
            # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
            processor = (
                AttnProcessor2_0()
                if hasattr(F, "scaled_dot_product_attention") and self.scale_qk
                else AttnProcessor()
            )

    self.set_processor(processor)
def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor":
    r"""
    Get the attention processor in use.

    Args:
        return_deprecated_lora (`bool`, *optional*, defaults to `False`):
            Set to `True` to return the deprecated LoRA attention processor.

    Returns:
        "AttentionProcessor": The attention processor in use, or a LoRA attention processor rebuilt from the
        `lora_layer` attributes of the projection layers when `return_deprecated_lora` is `True` and LoRA is
        active.
    """
    if not return_deprecated_lora:
        return self.processor

    # TODO(Sayak, Patrick). The rest of the function is needed to ensure backwards compatible
    # serialization format for LoRA Attention Processors. It should be deleted once the integration
    # with PEFT is completed.
    lora_flags = {}
    for layer_name, layer in self.named_modules():
        if hasattr(layer, "lora_layer"):
            lora_flags[layer_name] = layer.lora_layer is not None

    # 1. Without any active LoRA layer the current processor is returned unchanged.
    if not any(lora_flags.values()):
        return self.processor

    # `add_k_proj` / `add_v_proj` are allowed to lack LoRA, so they are excluded from the check.
    for optional_layer in ("add_k_proj", "add_v_proj"):
        lora_flags.pop(optional_layer, None)
    # 2. Every remaining layer must have LoRA activated.
    if not all(lora_flags.values()):
        raise ValueError(
            f"Make sure that either all layers or no layers have LoRA activated, but have {lora_flags}"
        )

    # 3. Look up the LoRA counterpart of the current processor class by name in this module.
    base_cls_name = self.processor.__class__.__name__
    lora_processor_cls = getattr(import_module(__name__), "LoRA" + base_cls_name)

    hidden_size = self.inner_dim
    q_lora = self.to_q.lora_layer
    # (attribute on the LoRA processor, source projection layer) for the four shared projections
    shared_projections = (
        ("to_q_lora", self.to_q),
        ("to_k_lora", self.to_k),
        ("to_v_lora", self.to_v),
        ("to_out_lora", self.to_out[0]),
    )

    if lora_processor_cls in (LoRAAttnProcessor, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor):
        kwargs = {
            "cross_attention_dim": self.cross_attention_dim,
            "rank": q_lora.rank,
            "network_alpha": q_lora.network_alpha,
            "q_rank": q_lora.rank,
            "q_hidden_size": q_lora.out_features,
            "k_rank": self.to_k.lora_layer.rank,
            "k_hidden_size": self.to_k.lora_layer.out_features,
            "v_rank": self.to_v.lora_layer.rank,
            "v_hidden_size": self.to_v.lora_layer.out_features,
            "out_rank": self.to_out[0].lora_layer.rank,
            "out_hidden_size": self.to_out[0].lora_layer.out_features,
        }

        if hasattr(self.processor, "attention_op"):
            kwargs["attention_op"] = self.processor.attention_op

        lora_processor = lora_processor_cls(hidden_size, **kwargs)
        for target_attr, source_layer in shared_projections:
            getattr(lora_processor, target_attr).load_state_dict(source_layer.lora_layer.state_dict())
    elif lora_processor_cls == LoRAAttnAddedKVProcessor:
        lora_processor = lora_processor_cls(
            hidden_size,
            cross_attention_dim=self.add_k_proj.weight.shape[0],
            rank=q_lora.rank,
            network_alpha=q_lora.network_alpha,
        )
        for target_attr, source_layer in shared_projections:
            getattr(lora_processor, target_attr).load_state_dict(source_layer.lora_layer.state_dict())

        # only save if used
        if self.add_k_proj.lora_layer is None:
            lora_processor.add_k_proj_lora = None
            lora_processor.add_v_proj_lora = None
        else:
            lora_processor.add_k_proj_lora.load_state_dict(self.add_k_proj.lora_layer.state_dict())
            lora_processor.add_v_proj_lora.load_state_dict(self.add_v_proj.lora_layer.state_dict())
    else:
        raise ValueError(f"{lora_processor_cls} does not exist.")

    return lora_processor
+ **cross_attention_kwargs: + Additional keyword arguments to pass along to the cross attention. + + Returns: + `torch.Tensor`: The output of the attention layer. + """ + # The `Attention` class can call different attention processors / attention functions + # here we simply pass along all tensors to the selected processor class + # For standard processors that are defined here, `**cross_attention_kwargs` is empty + + attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys()) + quiet_attn_parameters = {"ip_adapter_masks"} + unused_kwargs = [ + k for k, _ in cross_attention_kwargs.items() if k not in attn_parameters and k not in quiet_attn_parameters + ] + if len(unused_kwargs) > 0: + logger.warning( + f"cross_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored." + ) + cross_attention_kwargs = {k: w for k, w in cross_attention_kwargs.items() if k in attn_parameters} + + return self.processor( + self, + hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + + def batch_to_head_dim(self, tensor: torch.Tensor) -> torch.Tensor: + r""" + Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size // heads, seq_len, dim * heads]`. `heads` + is the number of heads initialized while constructing the `Attention` class. + + Args: + tensor (`torch.Tensor`): The tensor to reshape. + + Returns: + `torch.Tensor`: The reshaped tensor. 
def head_to_batch_dim(self, tensor: torch.Tensor, out_dim: int = 3) -> torch.Tensor:
    r"""
    Split the last dimension of `tensor` into `self.heads` attention heads.

    Args:
        tensor (`torch.Tensor`):
            Either a 3-D tensor `[batch_size, seq_len, dim]` or a 4-D tensor
            `[batch_size, extra_dim, seq_len, dim]`. For the 4-D case the two middle axes are merged into a
            single token axis of length `seq_len * extra_dim` by the reshape below.
        out_dim (`int`, *optional*, defaults to `3`):
            `3` folds the heads into the batch axis and returns
            `[batch_size * heads, tokens, dim // heads]`; any other value returns the 4-D layout
            `[batch_size, heads, tokens, dim // heads]`.

    Returns:
        `torch.Tensor`: The reshaped tensor.
    """
    num_heads = self.heads

    if tensor.ndim != 3:
        batch, groups, length, width = tensor.shape
    else:
        batch, length, width = tensor.shape
        groups = 1

    tokens = length * groups
    per_head = width // num_heads

    # [batch, tokens, heads, per_head] -> [batch, heads, tokens, per_head]
    split = tensor.reshape(batch, tokens, num_heads, per_head).permute(0, 2, 1, 3)

    if out_dim != 3:
        return split

    # Fold the head axis into the batch axis.
    return split.reshape(batch * num_heads, tokens, per_head)
def prepare_attention_mask(
    self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3
) -> torch.Tensor:
    r"""
    Pad an attention mask and replicate it once per attention head.

    Args:
        attention_mask (`torch.Tensor`):
            The mask to prepare. `None` is returned unchanged.
        target_length (`int`):
            Length the mask's last axis is compared against. When the lengths differ, `target_length` zeros
            are appended to the last axis (see the note in the body).
        batch_size (`int`):
            Batch size; with `out_dim == 3` the mask is only repeated when its first axis is smaller than
            `batch_size * self.heads`.
        out_dim (`int`, *optional*, defaults to `3`):
            `3` repeats the mask along axis 0; `4` inserts a head axis at position 1 and repeats along it.
            Any other value skips the head replication.

    Returns:
        `torch.Tensor`: The prepared attention mask (or `None`).
    """
    num_heads = self.heads
    if attention_mask is None:
        return attention_mask

    mask = attention_mask
    if mask.shape[-1] != target_length:
        if mask.device.type != "mps":
            # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
            #       we want to instead pad by (0, remaining_length), where remaining_length is:
            #       remaining_length: int = target_length - current_length
            # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
            mask = F.pad(mask, (0, target_length), value=0.0)
        else:
            # HACK: MPS: Does not support padding by greater than dimension of input tensor.
            # Instead, we can manually construct the padding tensor.
            zeros = torch.zeros(
                (mask.shape[0], mask.shape[1], target_length), dtype=mask.dtype, device=mask.device
            )
            mask = torch.cat([mask, zeros], dim=2)

    if out_dim == 4:
        return mask.unsqueeze(1).repeat_interleave(num_heads, dim=1)

    if out_dim == 3 and mask.shape[0] < batch_size * num_heads:
        mask = mask.repeat_interleave(num_heads, dim=0)

    return mask
class CustomJointAttnProcessor2_0:
    """Joint (sample + context) attention processor with optional query/key norm and RoPE.

    Runs `F.scaled_dot_product_attention` over the sample tokens, concatenated with the projected
    context tokens when `attn.is_cross_attention` is false.
    """

    def __init__(self):
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

    def apply_rotary_emb(
        self,
        x: torch.Tensor,
        freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
    ) -> torch.Tensor:
        """
        Apply rotary position embeddings to a single query or key tensor.

        Adjacent channel pairs `(x[2i], x[2i + 1])` are rotated: the result is
        `x * cos + rotate(x) * sin`, where `rotate` maps each pair to `(-x[2i + 1], x[2i])`.
        The arithmetic runs in float32 and the result is cast back to `x.dtype`.

        Args:
            x (`torch.Tensor`):
                4-D query or key tensor (the `flatten(3)` below requires four dimensions). Callers in this
                class pass `[B, H, S, D]`.
            freqs_cis (`Tuple[torch.Tensor]`):
                `(cos, sin)` pair. Each gets two leading singleton axes added, so 2-D `[S, D]` inputs
                broadcast against `[B, H, S, D]`.

        Returns:
            `torch.Tensor`: The rotated tensor, same shape and dtype as `x`.
        """
        cos, sin = freqs_cis  # each [S, D]
        cos = cos[None, None]
        sin = sin[None, None]
        cos, sin = cos.to(x.device), sin.to(x.device)

        # Split the last axis into (even, odd) channel pairs: [..., D] -> 2 x [..., D // 2]
        x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
        x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
        return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)

    def __call__(
        self,
        attn: "Attention",
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
        rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
        *args,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Run attention and return `(hidden_states, encoder_hidden_states)`.

        Args:
            attn: Attention module providing the projections (`to_q`/`to_k`/`to_v`, `add_*_proj`,
                `to_out`, `to_add_out`), `heads`, `norm_q`/`norm_k`, `is_cross_attention` and
                `context_pre_only`.
            hidden_states: Sample tokens, 3-D `[B, S, C]` or 4-D `[B, C, H, W]` (flattened to
                `[B, H * W, C]` internally and restored on return).
            encoder_hidden_states: Optional context tokens, 3-D or 4-D with the same convention.
            attention_mask: Accepted for interface compatibility; not used by this processor.
            rotary_freqs_cis: Optional `(cos, sin)` applied to the query, and to the key when
                `attn.is_cross_attention` is false.
            rotary_freqs_cis_cross: Optional `(cos, sin)` applied to the key in cross-attention mode.

        Returns:
            Tuple of the processed sample tokens and the context tokens (`None` when no context was given).
        """
        input_ndim = hidden_states.ndim
        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        # Measure the sample length AFTER flattening, so that 4-D inputs are split at the token count
        # (height * width) rather than at the channel count.
        hidden_states_len = hidden_states.shape[1]

        context_input_ndim = None
        if encoder_hidden_states is not None:
            context_input_ndim = encoder_hidden_states.ndim
            if context_input_ndim == 4:
                # Separate names: the sample's channel/height/width are still needed to restore it below.
                ctx_batch, ctx_channel, ctx_height, ctx_width = encoder_hidden_states.shape
                encoder_hidden_states = encoder_hidden_states.view(
                    ctx_batch, ctx_channel, ctx_height * ctx_width
                ).transpose(1, 2)

        batch_size = hidden_states.shape[0]

        # `sample` projections.
        query = attn.to_q(hidden_states)
        key = attn.to_k(hidden_states)
        value = attn.to_v(hidden_states)

        # `context` projections.
        if encoder_hidden_states is not None:
            encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
            encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
            encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)

            if not attn.is_cross_attention:
                # Joint attention: sample tokens first, context tokens appended.
                query = torch.cat([query, encoder_hidden_states_query_proj], dim=1)
                key = torch.cat([key, encoder_hidden_states_key_proj], dim=1)
                value = torch.cat([value, encoder_hidden_states_value_proj], dim=1)
            else:
                # NOTE(review): in cross-attention mode the projections computed above are discarded and
                # the raw tensors are attended over directly. Kept as-is; confirm this is intended.
                query = hidden_states
                key = encoder_hidden_states
                value = encoder_hidden_states

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads
        # [B, S, heads * head_dim] -> [B, heads, S, head_dim]
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # Apply query and key normalization if needed
        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        # Apply RoPE if needed
        if rotary_freqs_cis is not None:
            query = self.apply_rotary_emb(query, rotary_freqs_cis)
            if not attn.is_cross_attention:
                key = self.apply_rotary_emb(key, rotary_freqs_cis)
            elif rotary_freqs_cis_cross is not None:
                key = self.apply_rotary_emb(key, rotary_freqs_cis_cross)

        hidden_states = F.scaled_dot_product_attention(
            query, key, value, dropout_p=0.0, is_causal=False
        )
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # Split the attention outputs back into sample and context parts.
        if encoder_hidden_states is not None and not attn.is_cross_attention:
            hidden_states, encoder_hidden_states = (
                hidden_states[:, :hidden_states_len],
                hidden_states[:, hidden_states_len:],
            )

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)
        if encoder_hidden_states is not None and not attn.context_pre_only:
            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
        if encoder_hidden_states is not None and context_input_ndim == 4:
            encoder_hidden_states = encoder_hidden_states.transpose(-1, -2).reshape(
                batch_size, ctx_channel, ctx_height, ctx_width
            )

        return hidden_states, encoder_hidden_states
([S, D], [S, D],) + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings. + """ + cos, sin = freqs_cis # [S, D] + cos = cos[None, None] + sin = sin[None, None] + cos, sin = cos.to(x.device), sin.to(x.device) + + x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2] + x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3) + out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype) + + return out + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor = None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None, + rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None, + *args, + **kwargs, + ) -> torch.FloatTensor: + hidden_states_len = hidden_states.shape[1] + + input_ndim = hidden_states.ndim + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + if encoder_hidden_states is not None: + context_input_ndim = encoder_hidden_states.ndim + if context_input_ndim == 4: + batch_size, channel, height, width = encoder_hidden_states.shape + encoder_hidden_states = encoder_hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size = hidden_states.shape[0] + + # `sample` projections. + dtype = hidden_states.dtype + query = attn.to_q(hidden_states) + key = attn.to_k(hidden_states) + value = attn.to_v(hidden_states) + + # `context` projections. 
+ has_encoder_hidden_state_proj = hasattr(attn, "add_q_proj") and hasattr(attn, "add_k_proj") and hasattr(attn, "add_v_proj") + if encoder_hidden_states is not None and has_encoder_hidden_state_proj: + encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states) + encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + + # attention + if not attn.is_cross_attention: + query = torch.cat([query, encoder_hidden_states_query_proj], dim=1) + key = torch.cat([key, encoder_hidden_states_key_proj], dim=1) + value = torch.cat([value, encoder_hidden_states_value_proj], dim=1) + else: + query = hidden_states + key = encoder_hidden_states + value = encoder_hidden_states + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1) + key = key.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1).transpose(-1, -2) + value = value.transpose(-1, -2).reshape(batch_size, attn.heads, head_dim, -1) + + # RoPE需要 [B, H, S, D] 输入 + # 此时 query是 [B, H, D, S], 需要转成 [B, H, S, D] 才能应用RoPE + query = query.permute(0, 1, 3, 2) # [B, H, S, D] (从 [B, H, D, S]) + + # Apply query and key normalization if needed + if attn.norm_q is not None: + query = attn.norm_q(query) + if attn.norm_k is not None: + key = attn.norm_k(key) + + # Apply RoPE if needed + if rotary_freqs_cis is not None: + query = self.apply_rotary_emb(query, rotary_freqs_cis) + if not attn.is_cross_attention: + key = self.apply_rotary_emb(key, rotary_freqs_cis) + elif rotary_freqs_cis_cross is not None and has_encoder_hidden_state_proj: + key = self.apply_rotary_emb(key, rotary_freqs_cis_cross) + + # 此时 query是 [B, H, S, D],需要还原成 [B, H, D, S] + query = query.permute(0, 1, 3, 2) # [B, H, D, S] + + if attention_mask is not None: + # attention_mask: [B, S] -> [B, 1, S, 1] + attention_mask = attention_mask[:, None, :, None].to(key.dtype) # 
[B, 1, S, 1] + query = query * attention_mask.permute(0, 1, 3, 2) # [B, H, S, D] * [B, 1, S, 1] + if not attn.is_cross_attention: + key = key * attention_mask # key: [B, h, S, D] 与 mask [B, 1, S, 1] 相乘 + value = value * attention_mask.permute(0, 1, 3, 2) # 如果 value 是 [B, h, D, S],那么需调整mask以匹配S维度 + + if attn.is_cross_attention and encoder_attention_mask is not None and has_encoder_hidden_state_proj: + encoder_attention_mask = encoder_attention_mask[:, None, :, None].to(key.dtype) # [B, 1, S_enc, 1] + # 此时 key: [B, h, S_enc, D], value: [B, h, D, S_enc] + key = key * encoder_attention_mask # [B, h, S_enc, D] * [B, 1, S_enc, 1] + value = value * encoder_attention_mask.permute(0, 1, 3, 2) # [B, h, D, S_enc] * [B, 1, 1, S_enc] + + query = self.kernel_func(query) + key = self.kernel_func(key) + + query, key, value = query.float(), key.float(), value.float() + + value = F.pad(value, (0, 0, 0, 1), mode="constant", value=self.pad_val) + + vk = torch.matmul(value, key) + + hidden_states = torch.matmul(vk, query) + + if hidden_states.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.float() + + hidden_states = hidden_states[:, :, :-1] / (hidden_states[:, :, -1:] + self.eps) + + hidden_states = hidden_states.view(batch_size, attn.heads * head_dim, -1).permute(0, 2, 1) + + hidden_states = hidden_states.to(dtype) + if encoder_hidden_states is not None: + encoder_hidden_states = encoder_hidden_states.to(dtype) + + # Split the attention outputs. 
class CustomerAttnProcessor2_0:
    r"""
    Scaled dot-product attention processor with optional query/key norm, RoPE and a combined
    query/context padding mask for cross-attention.
    """

    def __init__(self):
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

    def apply_rotary_emb(
        self,
        x: torch.Tensor,
        freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
    ) -> torch.Tensor:
        """
        Apply rotary position embeddings to a single query or key tensor.

        Adjacent channel pairs `(x[2i], x[2i + 1])` are rotated: the result is
        `x * cos + rotate(x) * sin`, where `rotate` maps each pair to `(-x[2i + 1], x[2i])`.
        The arithmetic runs in float32 and the result is cast back to `x.dtype`.

        Args:
            x (`torch.Tensor`):
                4-D query or key tensor (the `flatten(3)` below requires four dimensions). Callers in this
                class pass `[B, H, S, D]`.
            freqs_cis (`Tuple[torch.Tensor]`):
                `(cos, sin)` pair. Each gets two leading singleton axes added, so 2-D `[S, D]` inputs
                broadcast against `[B, H, S, D]`.

        Returns:
            `torch.Tensor`: The rotated tensor, same shape and dtype as `x`.
        """
        cos, sin = freqs_cis  # each [S, D]
        cos = cos[None, None]
        sin = sin[None, None]
        cos, sin = cos.to(x.device), sin.to(x.device)

        # Split the last axis into (even, odd) channel pairs: [..., D] -> 2 x [..., D // 2]
        x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
        x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
        return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)

    def __call__(
        self,
        attn: "Attention",
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
        rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None,
        *args,
        **kwargs,
    ) -> torch.Tensor:
        """
        Run self- or cross-attention and return the processed `hidden_states`.

        Args:
            attn: Attention module providing `to_q`/`to_k`/`to_v`, `to_out`, `heads`, the optional norms,
                `is_cross_attention`, `residual_connection` and `rescale_output_factor`.
            hidden_states: Query-side tokens, 3-D `[B, S, C]` or 4-D `[B, C, H, W]`.
            encoder_hidden_states: Optional key/value-side tokens; `hidden_states` is used when `None`.
            attention_mask: In cross-attention, a `[N, S1]` validity mask over query tokens (1 = keep).
                In self-attention it is handed to `attn.prepare_attention_mask`.
            encoder_attention_mask: In cross-attention, a `[N, S2]` validity mask over context tokens.
            rotary_freqs_cis: Optional `(cos, sin)` for the query (and the key in self-attention).
            rotary_freqs_cis_cross: Optional `(cos, sin)` for the key in cross-attention.

        Returns:
            `torch.Tensor`: The attention output, in the same layout as the input `hidden_states`.
        """
        residual = hidden_states
        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )

        has_encoder_hidden_state_proj = (
            hasattr(attn, "add_q_proj") and hasattr(attn, "add_k_proj") and hasattr(attn, "add_v_proj")
        )

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        # [B, S, heads * head_dim] -> [B, heads, S, head_dim]
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        # Apply RoPE if needed
        if rotary_freqs_cis is not None:
            query = self.apply_rotary_emb(query, rotary_freqs_cis)
            if not attn.is_cross_attention:
                key = self.apply_rotary_emb(key, rotary_freqs_cis)
            elif rotary_freqs_cis_cross is not None and has_encoder_hidden_state_proj:
                key = self.apply_rotary_emb(key, rotary_freqs_cis_cross)

        if attn.is_cross_attention and encoder_attention_mask is not None and has_encoder_hidden_state_proj:
            # attention_mask: N x S1 (query tokens), encoder_attention_mask: N x S2 (context tokens).
            # Cross-attention merges both into one N x S1 x S2 additive mask.
            if attention_mask is None:
                # No query-side mask supplied: treat every query token as valid instead of failing on
                # `None[:, :, None]`.
                attention_mask = encoder_attention_mask.new_ones((query.shape[0], query.shape[2]))
            combined_mask = attention_mask[:, :, None] * encoder_attention_mask[:, None, :]
            # 0 where both sides are valid, -inf elsewhere (added to the attention logits).
            attention_mask = torch.where(combined_mask == 1, 0.0, -torch.inf)
            attention_mask = attention_mask[:, None, :, :].expand(-1, attn.heads, -1, -1).to(query.dtype)

        elif not attn.is_cross_attention and attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states
The resulting + tensors contain rotary embeddings and are returned as real tensors. + + Args: + x (`torch.Tensor`): + Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply + freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],) + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings. + """ + cos, sin = freqs_cis # [S, D] or [N, S, D] + if cos.ndim == 2: + cos = cos[None, None] + sin = sin[None, None] + elif cos.ndim == 3: + cos = cos[:, None] + sin = sin[:, None] + cos, sin = cos.to(x.device), sin.to(x.device) + + x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2] + x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3) + out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype) + + return out + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor = None, + attention_mask: Optional[torch.FloatTensor] = None, + image_rotary_emb: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + + batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + + # `sample` projections. + dtype = hidden_states.dtype + query = attn.to_q(hidden_states) + key = attn.to_k(hidden_states) + value = attn.to_v(hidden_states) + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + if attn.norm_q is not None: + query = attn.norm_q(query) + if attn.norm_k is not None: + key = attn.norm_k(key) + + # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states` + if encoder_hidden_states is not None: + # `context` projections. 
+ encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states) + encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + + encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view( + batch_size, -1, attn.heads, head_dim + ).transpose(1, 2) + encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view( + batch_size, -1, attn.heads, head_dim + ).transpose(1, 2) + encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view( + batch_size, -1, attn.heads, head_dim + ).transpose(1, 2) + + if attn.norm_added_q is not None: + encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj) + if attn.norm_added_k is not None: + encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj) + + # attention + query = torch.cat([encoder_hidden_states_query_proj, query], dim=2) + key = torch.cat([encoder_hidden_states_key_proj, key], dim=2) + value = torch.cat([encoder_hidden_states_value_proj, value], dim=2) + + if image_rotary_emb is not None: + query = self.apply_rotary_emb(query, image_rotary_emb) + key = self.apply_rotary_emb(key, image_rotary_emb) + + # hidden_states = F.scaled_dot_product_attention( + # query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + # ) + # apply linear attention + hidden_states = self.apply_linear_attention(query, key, value) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(dtype) + + if encoder_hidden_states is not None: + encoder_hidden_states, hidden_states = ( + hidden_states[:, : encoder_hidden_states.shape[1]], + hidden_states[:, encoder_hidden_states.shape[1] :], + ) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + encoder_hidden_states = 
class FluxAttnProcessor2_0:
    """Scaled dot-product attention processor that prepends projected context tokens to the sample tokens."""

    def __init__(self):
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("FluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

    def apply_rotary_emb(
        self,
        x: torch.Tensor,
        freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]]
    ):
        """
        Rotate adjacent channel pairs of `x` by the given `(cos, sin)` tables.

        `cos`/`sin` may be 2-D (two leading singleton axes are added) or 3-D (one singleton axis is
        inserted at position 1) so that they broadcast against a 4-D `x`. The computation is done in
        float32 and cast back to `x.dtype`.
        """
        cos, sin = freqs_cis
        if cos.ndim == 2:
            cos, sin = cos[None, None], sin[None, None]
        elif cos.ndim == 3:
            cos, sin = cos[:, None], sin[:, None]
        cos = cos.to(x.device)
        sin = sin.to(x.device)

        # Used for flux, cogvideox, hunyuan-dit
        pairs = x.reshape(*x.shape[:-1], -1, 2)
        even, odd = pairs.unbind(-1)
        rotated = torch.stack([-odd, even], dim=-1).flatten(3)
        return (x.float() * cos + rotated.float() * sin).to(x.dtype)

    def __call__(
        self,
        attn: "Attention",
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        image_rotary_emb: Optional[torch.Tensor] = None,
    ) -> torch.FloatTensor:
        """
        Run attention over `hidden_states`, jointly with `encoder_hidden_states` when given.

        Returns `(hidden_states, encoder_hidden_states)` when context tokens were supplied, otherwise
        just `hidden_states` (without the output projection, which only runs on the context path).
        """
        has_context = encoder_hidden_states is not None
        reference = encoder_hidden_states if has_context else hidden_states
        batch_size, _, _ = reference.shape

        # `sample` projections.
        query = attn.to_q(hidden_states)
        key = attn.to_k(hidden_states)
        value = attn.to_v(hidden_states)

        num_heads = attn.heads
        head_dim = key.shape[-1] // num_heads

        def split_heads(t: torch.Tensor) -> torch.Tensor:
            # [B, S, heads * head_dim] -> [B, heads, S, head_dim]
            return t.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)

        query = split_heads(query)
        key = split_heads(key)
        value = split_heads(value)

        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
        if has_context:
            # `context` projections.
            context_query = split_heads(attn.add_q_proj(encoder_hidden_states))
            context_key = split_heads(attn.add_k_proj(encoder_hidden_states))
            context_value = split_heads(attn.add_v_proj(encoder_hidden_states))

            if attn.norm_added_q is not None:
                context_query = attn.norm_added_q(context_query)
            if attn.norm_added_k is not None:
                context_key = attn.norm_added_k(context_key)

            # Context tokens go first along the sequence axis.
            query = torch.cat([context_query, query], dim=2)
            key = torch.cat([context_key, key], dim=2)
            value = torch.cat([context_value, value], dim=2)

        if image_rotary_emb is not None:
            query = self.apply_rotary_emb(query, image_rotary_emb)
            key = self.apply_rotary_emb(key, image_rotary_emb)

        attended = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )
        attended = attended.transpose(1, 2).reshape(batch_size, -1, num_heads * head_dim)
        attended = attended.to(query.dtype)

        if not has_context:
            return attended

        context_len = encoder_hidden_states.shape[1]
        context_out = attended[:, :context_len]
        sample_out = attended[:, context_len:]

        # linear proj
        sample_out = attn.to_out[0](sample_out)
        # dropout
        sample_out = attn.to_out[1](sample_out)

        context_out = attn.to_add_out(context_out)

        return sample_out, context_out
+ # else: it's a symmetrical convolution + if causal: + padding = 0 + self.lorder = kernel_size - 1 + else: + # kernel_size should be an odd number for none causal convolution + assert (kernel_size - 1) % 2 == 0 + padding = (kernel_size - 1) // 2 + self.lorder = 0 + self.depthwise_conv = nn.Conv1d( + channels, + channels, + kernel_size, + stride=1, + padding=padding, + groups=channels, + bias=bias, + ) + + assert norm in ['batch_norm', 'layer_norm'] + if norm == "batch_norm": + self.use_layer_norm = False + self.norm = nn.BatchNorm1d(channels) + else: + self.use_layer_norm = True + self.norm = nn.LayerNorm(channels) + + self.pointwise_conv2 = nn.Conv1d( + channels, + channels, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + self.activation = activation + + def forward( + self, + x: torch.Tensor, + mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + cache: torch.Tensor = torch.zeros((0, 0, 0)), + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute convolution module. + Args: + x (torch.Tensor): Input tensor (#batch, time, channels). + mask_pad (torch.Tensor): used for batch padding (#batch, 1, time), + (0, 0, 0) means fake mask. + cache (torch.Tensor): left context cache, it is only + used in causal convolution (#batch, channels, cache_t), + (0, 0, 0) meas fake cache. + Returns: + torch.Tensor: Output tensor (#batch, time, channels). 
+ """ + # exchange the temporal dimension and the feature dimension + x = x.transpose(1, 2) # (#batch, channels, time) + + # mask batch padding + if mask_pad.size(2) > 0: # time > 0 + x.masked_fill_(~mask_pad, 0.0) + + if self.lorder > 0: + if cache.size(2) == 0: # cache_t == 0 + x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0) + else: + assert cache.size(0) == x.size(0) # equal batch + assert cache.size(1) == x.size(1) # equal channel + x = torch.cat((cache, x), dim=2) + assert (x.size(2) > self.lorder) + new_cache = x[:, :, -self.lorder:] + else: + # It's better we just return None if no cache is required, + # However, for JIT export, here we just fake one tensor instead of + # None. + new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device) + + # GLU mechanism + x = self.pointwise_conv1(x) # (batch, 2*channel, dim) + x = nn.functional.glu(x, dim=1) # (batch, channel, dim) + + # 1D Depthwise Conv + x = self.depthwise_conv(x) + if self.use_layer_norm: + x = x.transpose(1, 2) + x = self.activation(self.norm(x)) + if self.use_layer_norm: + x = x.transpose(1, 2) + x = self.pointwise_conv2(x) + # mask batch padding + if mask_pad.size(2) > 0: # time > 0 + x.masked_fill_(~mask_pad, 0.0) + + return x.transpose(1, 2), new_cache + +class PositionwiseFeedForward(torch.nn.Module): + """Positionwise feed forward layer. + + FeedForward are appied on each position of the sequence. + The output dim is same with the input dim. + + Args: + idim (int): Input dimenstion. + hidden_units (int): The number of hidden units. + dropout_rate (float): Dropout rate. 
class MultiHeadedAttention(nn.Module):
    """Multi-head scaled dot-product attention with an optional key/value cache.

    Args:
        n_head (int): Number of attention heads.
        n_feat (int): Feature size; must be divisible by ``n_head``.
        dropout_rate (float): Dropout probability applied to attention weights.
        key_bias (bool): Whether the key projection carries a bias term.
    """

    def __init__(self,
                 n_head: int,
                 n_feat: int,
                 dropout_rate: float,
                 key_bias: bool = True):
        """Create the four projections and the attention dropout."""
        super().__init__()
        assert n_feat % n_head == 0
        self.h = n_head
        # Keys and values share the same per-head width.
        self.d_k = n_feat // n_head
        self.linear_q = nn.Linear(n_feat, n_feat)
        self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias)
        self.linear_v = nn.Linear(n_feat, n_feat)
        self.linear_out = nn.Linear(n_feat, n_feat)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward_qkv(
        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Project the inputs and split them into heads.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).

        Returns:
            torch.Tensor: Query heads (#batch, n_head, time1, d_k).
            torch.Tensor: Key heads (#batch, n_head, time2, d_k).
            torch.Tensor: Value heads (#batch, n_head, time2, d_k).
        """
        batch = query.size(0)
        # (batch, time, size) -> (batch, time, head, d_k) -> (batch, head, time, d_k)
        q = self.linear_q(query).view(batch, -1, self.h, self.d_k).transpose(1, 2)
        k = self.linear_k(key).view(batch, -1, self.h, self.d_k).transpose(1, 2)
        v = self.linear_v(value).view(batch, -1, self.h, self.d_k).transpose(1, 2)
        return q, k, v

    def forward_attention(
        self,
        value: torch.Tensor,
        scores: torch.Tensor,
        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool)
    ) -> torch.Tensor:
        """Turn attention scores into the projected context vectors.

        Args:
            value (torch.Tensor): Value heads (#batch, n_head, time2, d_k).
            scores (torch.Tensor): Attention scores
                (#batch, n_head, time1, time2).
            mask (torch.Tensor): Mask (#batch, 1, time2) or
                (#batch, time1, time2); a (0, 0, 0) tensor means "no mask".

        Returns:
            torch.Tensor: Context (#batch, time1, d_model) after the output
            projection.
        """
        batch = value.size(0)

        if mask.size(2) == 0:
            # Fake mask: plain softmax over the key axis.
            attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)
        else:
            blocked = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
            # For the last chunk the mask may be longer than the score row.
            blocked = blocked[:, :, :, :scores.size(-1)]
            attn = torch.softmax(scores.masked_fill(blocked, float("-inf")), dim=-1)
            # Rows that are blocked everywhere come out of softmax as NaN;
            # overwrite every blocked position with an exact zero.
            attn = attn.masked_fill(blocked, 0.0)

        context = torch.matmul(self.dropout(attn), value)  # (batch, head, time1, d_k)
        context = context.transpose(1, 2).contiguous().view(
            batch, -1, self.h * self.d_k)  # (batch, time1, d_model)
        return self.linear_out(context)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        pos_emb: torch.Tensor = torch.empty(0),
        cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute scaled dot-product attention.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            mask (torch.Tensor): Mask (#batch, 1, time2) or
                (#batch, time1, time2); a (0, 0, 0) tensor means "no mask".
            pos_emb (torch.Tensor): Not read by this class; accepted so the
                call signature matches the relative-position variant.
            cache (torch.Tensor): Previous keys and values concatenated on the
                last axis, (1, head, cache_t, d_k * 2); an empty tensor means
                "no cache".

        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).
            torch.Tensor: Updated cache (1, head, cache_t + time1, d_k * 2).
        """
        q, k, v = self.forward_qkv(query, key, value)

        if cache.size(0) > 0:
            # The cache stores [keys | values]; split it in half and prepend
            # the halves along the time axis.
            past_k, past_v = torch.split(cache, cache.size(-1) // 2, dim=-1)
            k = torch.cat([past_k, k], dim=2)
            v = torch.cat([past_v, v], dim=2)
        new_cache = torch.cat((k, v), dim=-1)

        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        output = self.forward_attention(v, scores, mask)
        return output, new_cache
+ + Returns: + torch.Tensor: Output tensor. + + """ + zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1), + device=x.device, + dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(x.size()[0], + x.size()[1], + x.size(3) + 1, x.size(2)) + x = x_padded[:, :, 1:].view_as(x)[ + :, :, :, : x.size(-1) // 2 + 1 + ] # only keep the positions from 0 to time2 + return x + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool), + pos_emb: torch.Tensor = torch.empty(0), + cache: torch.Tensor = torch.zeros((0, 0, 0, 0)) + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute 'Scaled Dot Product Attention' with rel. positional encoding. + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2), (0, 0, 0) means fake mask. + pos_emb (torch.Tensor): Positional embedding tensor + (#batch, time2, size). + cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2), + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2) + where `cache_t == chunk_size * num_decoding_left_chunks` + and `head * d_k == size` + """ + q, k, v = self.forward_qkv(query, key, value) + q = q.transpose(1, 2) # (batch, time1, head, d_k) + + if cache.size(0) > 0: + key_cache, value_cache = torch.split(cache, + cache.size(-1) // 2, + dim=-1) + k = torch.cat([key_cache, k], dim=2) + v = torch.cat([value_cache, v], dim=2) + # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's + # non-trivial to calculate `next_cache_start` here. 
+ new_cache = torch.cat((k, v), dim=-1) + + n_batch_pos = pos_emb.size(0) + p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) + p = p.transpose(1, 2) # (batch, head, time1, d_k) + + # (batch, head, time1, d_k) + q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) + # (batch, head, time1, d_k) + q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch, head, time1, time2) + matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) + + # compute matrix b and matrix d + # (batch, head, time1, time2) + matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) + # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used + if matrix_ac.shape != matrix_bd.shape: + matrix_bd = self.rel_shift(matrix_bd) + + scores = (matrix_ac + matrix_bd) / math.sqrt( + self.d_k) # (batch, head, time1, time2) + + return self.forward_attention(v, scores, mask), new_cache + + + +def subsequent_mask( + size: int, + device: torch.device = torch.device("cpu"), +) -> torch.Tensor: + """Create mask for subsequent steps (size, size). + + This mask is used only in decoder which works in an auto-regressive mode. + This means the current step could only do attention with its left steps. + + In encoder, fully attention is used when streaming is not necessary and + the sequence is not long. In this case, no attention mask is needed. + + When streaming is need, chunk-based attention is used in encoder. See + subsequent_chunk_mask for the chunk-based attention mask. 
def subsequent_chunk_mask(
        size: int,
        chunk_size: int,
        num_left_chunks: int = -1,
        device: torch.device = torch.device("cpu"),
) -> torch.Tensor:
    """Create mask for subsequent steps (size, size) with chunk size,
       this is for streaming encoder

    Row ``i`` is True for every column from the start of the allowed left
    context up to the end of the chunk that contains ``i``. The mask is built
    with one vectorised comparison instead of a Python loop over rows.

    Args:
        size (int): size of mask
        chunk_size (int): size of chunk; expected to be positive
        num_left_chunks (int): number of left chunks
            <0: use full chunk
            >=0: use num_left_chunks
        device (torch.device): "cpu" or "cuda" or torch.Tensor.device

    Returns:
        torch.Tensor: boolean mask of shape (size, size)

    Raises:
        ZeroDivisionError: if ``chunk_size`` is 0 and ``size`` is positive.

    Examples:
        >>> subsequent_chunk_mask(4, 2)
        [[1, 1, 0, 0],
         [1, 1, 0, 0],
         [1, 1, 1, 1],
         [1, 1, 1, 1]]
    """
    if size > 0 and chunk_size == 0:
        # Keep the failure the loop-based implementation produced; integer
        # tensor division by zero is not reported consistently across devices.
        raise ZeroDivisionError("integer division or modulo by zero")

    positions = torch.arange(size, device=device)
    # Index of the chunk each row belongs to.
    chunk_index = torch.div(positions, chunk_size, rounding_mode="floor")

    # Exclusive right edge: the end of the row's own chunk. Columns never
    # reach `size`, so no extra clamp is required.
    ending = (chunk_index + 1) * chunk_size
    columns = positions.unsqueeze(0)  # (1, size)
    ret = columns < ending.unsqueeze(1)  # (size, size)

    if num_left_chunks >= 0:
        # Inclusive left edge: `num_left_chunks` chunks before the row's own.
        start = ((chunk_index - num_left_chunks) * chunk_size).clamp(min=0)
        ret = ret & (columns >= start.unsqueeze(1))
    return ret
+ decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's + 0: default for training, use random dynamic chunk. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + static_chunk_size (int): chunk size for static chunk training/decoding + if it's greater than 0, if use_dynamic_chunk is true, + this parameter will be ignored + num_decoding_left_chunks: number of left chunks, this is for decoding, + the chunk size is decoding_chunk_size. + >=0: use num_decoding_left_chunks + <0: use all left chunks + enable_full_context (bool): + True: chunk size is either [1, 25] or full context(max_len) + False: chunk size ~ U[1, 25] + + Returns: + torch.Tensor: chunk mask of the input xs. + """ + # Whether to use chunk mask or not + if use_dynamic_chunk: + max_len = xs.size(1) + if decoding_chunk_size < 0: + chunk_size = max_len + num_left_chunks = -1 + elif decoding_chunk_size > 0: + chunk_size = decoding_chunk_size + num_left_chunks = num_decoding_left_chunks + else: + # chunk size is either [1, 25] or full context(max_len). + # Since we use 4 times subsampling and allow up to 1s(100 frames) + # delay, the maximum frame is 100 / 4 = 25. 
class ConformerEncoderLayer(nn.Module):
    """One encoder block: optional macaron feed-forward, self-attention,
    optional convolution module and a feed-forward module, each wrapped in a
    residual connection.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance, called as
            ``self_attn(x, x, x, mask, pos_emb, att_cache)`` and expected to
            return ``(output, new_cache)``.
        feed_forward (torch.nn.Module): Feed-forward module instance.
        feed_forward_macaron (torch.nn.Module): Additional feed-forward module
            run before attention; when given, both feed-forward outputs are
            scaled by 0.5.
        conv_module (torch.nn.Module): Convolution module instance, called as
            ``conv_module(x, mask_pad, cnn_cache)``.
        dropout_rate (float): Dropout rate applied to every sub-module output.
        normalize_before (bool):
            True: use layer_norm before each sub-block.
            False: use layer_norm after each sub-block.
    """

    def __init__(
        self,
        size: int,
        self_attn: torch.nn.Module,
        feed_forward: Optional[nn.Module] = None,
        feed_forward_macaron: Optional[nn.Module] = None,
        conv_module: Optional[nn.Module] = None,
        dropout_rate: float = 0.1,
        normalize_before: bool = True,
    ):
        """Register the sub-modules and their layer norms."""
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.feed_forward_macaron = feed_forward_macaron
        self.conv_module = conv_module

        self.norm_ff = nn.LayerNorm(size, eps=1e-5)  # feed-forward module
        self.norm_mha = nn.LayerNorm(size, eps=1e-5)  # attention module

        # Macaron style halves the weight of each feed-forward branch.
        self.ff_scale = 1.0
        if feed_forward_macaron is not None:
            self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5)
            self.ff_scale = 0.5

        # These two norms only exist when a convolution module is present.
        if self.conv_module is not None:
            self.norm_conv = nn.LayerNorm(size, eps=1e-5)  # convolution module
            self.norm_final = nn.LayerNorm(size, eps=1e-5)  # block output

        self.dropout = nn.Dropout(dropout_rate)
        self.size = size
        self.normalize_before = normalize_before

    def forward(
        self,
        x: torch.Tensor,
        mask: torch.Tensor,
        pos_emb: torch.Tensor,
        mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
        cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute encoded features.

        Args:
            x (torch.Tensor): (#batch, time, size)
            mask (torch.Tensor): Mask tensor for the input
                (#batch, time, time), (0, 0, 0) means fake mask.
            pos_emb (torch.Tensor): positional encoding, forwarded to the
                attention module.
            mask_pad (torch.Tensor): batch padding mask used for conv module,
                (#batch, 1, time), (0, 0, 0) means fake mask.
            att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
                (#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
            cnn_cache (torch.Tensor): Convolution cache in conformer layer
                (#batch=1, size, cache_t2)

        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: The ``mask`` argument, returned unchanged.
            torch.Tensor: att_cache tensor,
                (#batch=1, head, cache_t1 + time, d_k * 2).
            torch.Tensor: cnn_cache tensor (#batch, size, cache_t2); an empty
                (0, 0, 0) tensor when there is no convolution module.
        """
        pre_norm = self.normalize_before

        # Optional macaron-style feed-forward in front of attention.
        if self.feed_forward_macaron is not None:
            shortcut = x
            if pre_norm:
                x = shortcut + self.ff_scale * self.dropout(
                    self.feed_forward_macaron(self.norm_ff_macaron(x)))
            else:
                x = self.norm_ff_macaron(
                    shortcut + self.ff_scale * self.dropout(
                        self.feed_forward_macaron(x)))

        # Multi-headed self-attention.
        shortcut = x
        if pre_norm:
            x = self.norm_mha(x)
        att_out, new_att_cache = self.self_attn(x, x, x, mask, pos_emb,
                                                att_cache)
        x = shortcut + self.dropout(att_out)
        if not pre_norm:
            x = self.norm_mha(x)

        # Convolution module; the placeholder cache is replaced when it runs.
        new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
        if self.conv_module is not None:
            shortcut = x
            if pre_norm:
                x = self.norm_conv(x)
            conv_out, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache)
            x = shortcut + self.dropout(conv_out)
            if not pre_norm:
                x = self.norm_conv(x)

        # Feed-forward module.
        shortcut = x
        if pre_norm:
            x = shortcut + self.ff_scale * self.dropout(
                self.feed_forward(self.norm_ff(x)))
        else:
            x = self.norm_ff(
                shortcut + self.ff_scale * self.dropout(self.feed_forward(x)))

        # The closing norm is only applied in the configuration that created it.
        if self.conv_module is not None:
            x = self.norm_final(x)

        return x, mask, new_att_cache, new_cnn_cache
+ max_len (int): Maximum input length. + + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + """Construct an PositionalEncoding object.""" + super(EspnetRelPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """Reset the positional encodings.""" + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if self.pe.size(1) >= x.size(1) * 2 - 1: + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + # Suppose `i` means to the position of query vecotr and `j` means the + # position of key vector. We use position relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i Tuple[torch.Tensor, torch.Tensor]: + """Add positional encoding. + + Args: + x (torch.Tensor): Input tensor (batch, time, `*`). + + Returns: + torch.Tensor: Encoded tensor (batch, time, `*`). + + """ + self.extend_pe(x) + x = x * self.xscale + pos_emb = self.position_encoding(size=x.size(1), offset=offset) + return self.dropout(x), self.dropout(pos_emb) + + def position_encoding(self, + offset: Union[int, torch.Tensor], + size: int) -> torch.Tensor: + """ For getting encoding in a streaming fashion + + Attention!!!!! + we apply dropout only once at the whole utterance level in a none + streaming way, but will call this function several times with + increasing input size in a streaming scenario, so the dropout will + be applied several times. 
class LinearEmbed(torch.nn.Module):
    """Linear transform the input without subsampling

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
        pos_enc_class (torch.nn.Module): Positional-encoding module *instance*
            (despite the name it is called directly, not constructed here).
            It must be callable as ``pos_enc(x, offset)`` returning
            ``(x, pos_emb)`` and expose ``position_encoding(offset, size)``.

    """

    def __init__(self, idim: int, odim: int, dropout_rate: float,
                 pos_enc_class: torch.nn.Module):
        """Construct an linear object."""
        super().__init__()
        self.out = torch.nn.Sequential(
            torch.nn.Linear(idim, odim),
            torch.nn.LayerNorm(odim, eps=1e-5),
            torch.nn.Dropout(dropout_rate),
        )
        self.pos_enc = pos_enc_class  # rel_pos_espnet

    def position_encoding(self, offset: Union[int, torch.Tensor],
                          size: int) -> torch.Tensor:
        """Delegate to the positional-encoding module's ``position_encoding``."""
        return self.pos_enc.position_encoding(offset, size)

    def forward(
        self,
        x: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Project the input and attach positional information.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            offset (int or torch.Tensor): Forwarded unchanged to the
                positional-encoding module.

        Returns:
            torch.Tensor: Embedded input (#batch, time, odim) as returned by
                the positional-encoding module; the time axis is not
                subsampled by this layer.
            torch.Tensor: Positional embedding returned by the
                positional-encoding module.

        """
        # The annotation previously promised three tensors although only two
        # are returned; no mask is taken or produced by this layer.
        x = self.out(x)
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb
class ConformerEncoder(torch.nn.Module):
    """Conformer encoder module.

    A linear input embedding with relative positional encoding followed by
    ``num_blocks`` ``ConformerEncoderLayer`` blocks and an optional final
    layer norm.
    """

    def __init__(
        self,
        input_size: int,
        output_size: int = 1024,
        attention_heads: int = 16,
        linear_units: int = 4096,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: str = 'linear',
        pos_enc_layer_type: str = 'rel_pos_espnet',
        normalize_before: bool = True,
        static_chunk_size: int = 1,  # 1: causal_mask; 0: full_mask
        use_dynamic_chunk: bool = False,
        use_dynamic_left_chunk: bool = False,
        positionwise_conv_kernel_size: int = 1,
        macaron_style: bool = False,
        selfattention_layer_type: str = "rel_selfattn",
        activation_type: str = "swish",
        use_cnn_module: bool = False,
        cnn_module_kernel: int = 15,
        causal: bool = False,
        cnn_module_norm: str = "batch_norm",
        key_bias: bool = True,
        gradient_checkpointing: bool = False,
    ):
        """Construct ConformerEncoder

        Args:
            input_size (int): Feature size of the input.
            output_size (int): Model width used by every block.
            attention_heads (int): Number of attention heads.
            linear_units (int): Hidden size of the feed-forward modules.
            num_blocks (int): Number of encoder blocks.
            dropout_rate (float): Dropout rate for embedding and blocks.
            positional_dropout_rate (float): Dropout rate of the positional
                encoding.
            attention_dropout_rate (float): Dropout rate inside attention.
            input_layer (str): Not read here; a linear embedding is always
                used. Kept for configuration compatibility.
            pos_enc_layer_type (str): Not read here; the espnet relative
                positional encoding is always used.
            normalize_before (bool): Use layer norm before each sub-block and
                apply a final layer norm to the encoder output.
            static_chunk_size (int): Chunk size for the static chunk mask
                (1: causal mask; 0: full mask).
            use_dynamic_chunk (bool): Whether to use dynamic chunk masks.
            use_dynamic_left_chunk (bool): Whether to use dynamic left chunks.
            positionwise_conv_kernel_size (int): Not read here.
            macaron_style (bool): Whether to use macaron style for
                positionwise layer.
            selfattention_layer_type (str): Encoder attention layer type,
                the parameter has no effect now, it's just for configure
                compatibility. #'rel_selfattn'
            activation_type (str): Encoder activation function type.
            use_cnn_module (bool): Whether to use convolution module.
            cnn_module_kernel (int): Kernel size of convolution module.
            causal (bool): whether to use causal convolution or not.
            cnn_module_norm (str): Norm used inside the convolution module.
            key_bias: whether use bias in attention.linear_k, False for whisper models.
            gradient_checkpointing (bool): Recompute block activations during
                backward while training, to save memory.
        """
        super().__init__()
        self.output_size = output_size
        self.embed = LinearEmbed(input_size, output_size, dropout_rate,
                                 EspnetRelPositionalEncoding(output_size, positional_dropout_rate))
        self.normalize_before = normalize_before
        self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5)
        self.gradient_checkpointing = gradient_checkpointing

        self.static_chunk_size = static_chunk_size
        self.use_dynamic_chunk = use_dynamic_chunk
        self.use_dynamic_left_chunk = use_dynamic_left_chunk
        # One activation instance is shared by every block.
        activation = ACTIVATION_CLASSES[activation_type]()

        # self-attention module definition
        encoder_selfattn_layer_args = (
            attention_heads,
            output_size,
            attention_dropout_rate,
            key_bias,
        )
        # feed-forward module definition
        positionwise_layer_args = (
            output_size,
            linear_units,
            dropout_rate,
            activation,
        )
        # convolution module definition
        convolution_layer_args = (output_size, cnn_module_kernel, activation,
                                  cnn_module_norm, causal)

        self.encoders = torch.nn.ModuleList([
            ConformerEncoderLayer(
                output_size,
                RelPositionMultiHeadedAttention(
                    *encoder_selfattn_layer_args),
                PositionwiseFeedForward(*positionwise_layer_args),
                PositionwiseFeedForward(
                    *positionwise_layer_args) if macaron_style else None,
                ConvolutionModule(
                    *convolution_layer_args) if use_cnn_module else None,
                dropout_rate,
                normalize_before,
            ) for _ in range(num_blocks)
        ])

    def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
                       pos_emb: torch.Tensor,
                       mask_pad: torch.Tensor) -> torch.Tensor:
        """Run every encoder block in order and return the final features."""
        for layer in self.encoders:
            xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
        return xs

    @torch.jit.unused
    def forward_layers_checkpointed(self, xs: torch.Tensor,
                                    chunk_masks: torch.Tensor,
                                    pos_emb: torch.Tensor,
                                    mask_pad: torch.Tensor) -> torch.Tensor:
        """Same as ``forward_layers`` but with activation checkpointing.

        NOTE(xcsong):
            We pass the `__call__` method of the modules instead of `forward` to the
            checkpointing API because `__call__` attaches all the hooks of the module.
            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
        """
        # Imported here because the module never imported `ckpt` at file
        # level, which made this method fail with NameError.
        from torch.utils import checkpoint as ckpt

        for layer in self.encoders:
            # `use_reentrant` is passed explicitly, as PyTorch asks callers to
            # do; the non-reentrant variant is the recommended one.
            xs, chunk_masks, _, _ = ckpt.checkpoint(layer.__call__, xs,
                                                    chunk_masks, pos_emb,
                                                    mask_pad,
                                                    use_reentrant=False)
        return xs

    def forward(
        self,
        xs: torch.Tensor,
        pad_mask: torch.Tensor,
        decoding_chunk_size: int = 0,
        num_decoding_left_chunks: int = -1,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Embed positions in tensor.

        Args:
            xs: padded input tensor (B, T, D)
            pad_mask: padding mask (B, T); converted to bool, so non-zero
                entries mark valid positions
            decoding_chunk_size: decoding chunk size for dynamic chunk
                0: default for training, use random dynamic chunk.
                <0: for decoding, use full chunk.
                >0: for decoding, use fixed chunk size as set.
            num_decoding_left_chunks: number of left chunks, this is for decoding,
            the chunk size is decoding_chunk_size.
                >=0: use num_decoding_left_chunks
                <0: use all left chunks
        Returns:
            encoder output tensor xs, and the padding mask
            xs: padded output tensor (B, T, output_size)
            masks: torch.Tensor boolean batch padding mask (B, 1, T)
        """
        masks = pad_mask.to(torch.bool).unsqueeze(1)  # (B, 1, T)
        xs, pos_emb = self.embed(xs)
        # The linear embedding keeps the time axis, so the padding mask can be
        # reused as-is for the convolution modules.
        mask_pad = masks
        chunk_masks = add_optional_chunk_mask(xs, masks,
                                              self.use_dynamic_chunk,
                                              self.use_dynamic_left_chunk,
                                              decoding_chunk_size,
                                              self.static_chunk_size,
                                              num_decoding_left_chunks)
        if self.gradient_checkpointing and self.training:
            xs = self.forward_layers_checkpointed(xs, chunk_masks, pos_emb,
                                                  mask_pad)
        else:
            xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
        if self.normalize_before:
            xs = self.after_norm(xs)
        # Here we assume the mask is not changed in encoder layers, so just
        # return the masks before encoder layers, and the masks will be used
        # for cross attention with decoder later
        return xs, masks
less than the text_split_length + # then add the current sentence to the last sentence + text_splits[-1] += " " + str(sentence) + text_splits[-1] = text_splits[-1].lstrip() + elif len(str(sentence)) > text_split_length: + # if the current sentence is greater than the text_split_length + for line in textwrap.wrap( + str(sentence), + width=text_split_length, + drop_whitespace=True, + break_on_hyphens=False, + tabsize=1, + ): + text_splits.append(str(line)) + else: + text_splits.append(str(sentence)) + + if len(text_splits) > 1: + if text_splits[0] == "": + del text_splits[0] + else: + text_splits = [text.lstrip()] + + return text_splits + + +_whitespace_re = re.compile(r"\s+") + +# List of (regular expression, replacement) pairs for abbreviations: +_abbreviations = { + "en": [ + (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + for x in [ + ("mrs", "misess"), + ("mr", "mister"), + ("dr", "doctor"), + ("st", "saint"), + ("co", "company"), + ("jr", "junior"), + ("maj", "major"), + ("gen", "general"), + ("drs", "doctors"), + ("rev", "reverend"), + ("lt", "lieutenant"), + ("hon", "honorable"), + ("sgt", "sergeant"), + ("capt", "captain"), + ("esq", "esquire"), + ("ltd", "limited"), + ("col", "colonel"), + ("ft", "fort"), + ] + ], + "es": [ + (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + for x in [ + ("sra", "señora"), + ("sr", "señor"), + ("dr", "doctor"), + ("dra", "doctora"), + ("st", "santo"), + ("co", "compañía"), + ("jr", "junior"), + ("ltd", "limitada"), + ] + ], + "fr": [ + (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + for x in [ + ("mme", "madame"), + ("mr", "monsieur"), + ("dr", "docteur"), + ("st", "saint"), + ("co", "compagnie"), + ("jr", "junior"), + ("ltd", "limitée"), + ] + ], + "de": [ + (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + for x in [ + ("fr", "frau"), + ("dr", "doktor"), + ("st", "sankt"), + ("co", "firma"), + ("jr", "junior"), + ] + ], + "pt": [ + (re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) + for x in [ + ("sra", "senhora"), + ("sr", "senhor"), + ("dr", "doutor"), + ("dra", "doutora"), + ("st", "santo"), + ("co", "companhia"), + ("jr", "júnior"), + ("ltd", "limitada"), + ] + ], + "it": [ + (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + for x in [ + # ("sig.ra", "signora"), + ("sig", "signore"), + ("dr", "dottore"), + ("st", "santo"), + ("co", "compagnia"), + ("jr", "junior"), + ("ltd", "limitata"), + ] + ], + "pl": [ + (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + for x in [ + ("p", "pani"), + ("m", "pan"), + ("dr", "doktor"), + ("sw", "święty"), + ("jr", "junior"), + ] + ], + "ar": [ + (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + for x in [ + # There are not many common abbreviations in Arabic as in English. + ] + ], + "zh": [ + (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + for x in [ + # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts. + ] + ], + "cs": [ + (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + for x in [ + ("dr", "doktor"), # doctor + ("ing", "inženýr"), # engineer + ("p", "pan"), # Could also map to pani for woman but no easy way to do it + # Other abbreviations would be specialized and not as common. + ] + ], + "ru": [ + (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1]) + for x in [ + ("г-жа", "госпожа"), # Mrs. + ("г-н", "господин"), # Mr. + ("д-р", "доктор"), # doctor + # Other abbreviations are less common or specialized. + ] + ], + "nl": [ + (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + for x in [ + ("dhr", "de heer"), # Mr. + ("mevr", "mevrouw"), # Mrs. + ("dr", "dokter"), # doctor + ("jhr", "jonkheer"), # young lord or nobleman + # Dutch uses more abbreviations, but these are the most common ones. + ] + ], + "tr": [ + (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + for x in [ + ("b", "bay"), # Mr. + ("byk", "büyük"), # büyük + ("dr", "doktor"), # doctor + # Add other Turkish abbreviations here if needed. 
+ ] + ], + "hu": [ + (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + for x in [ + ("dr", "doktor"), # doctor + ("b", "bácsi"), # Mr. + ("nőv", "nővér"), # nurse + # Add other Hungarian abbreviations here if needed. + ] + ], + "ko": [ + (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + for x in [ + # Korean doesn't typically use abbreviations in the same way as Latin-based scripts. + ] + ], +} + + +def expand_abbreviations_multilingual(text, lang="en"): + for regex, replacement in _abbreviations[lang]: + text = re.sub(regex, replacement, text) + return text + + +_symbols_multilingual = { + "en": [ + (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + for x in [ + ("&", " and "), + ("@", " at "), + ("%", " percent "), + ("#", " hash "), + ("$", " dollar "), + ("£", " pound "), + ("°", " degree "), + ] + ], + "es": [ + (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + for x in [ + ("&", " y "), + ("@", " arroba "), + ("%", " por ciento "), + ("#", " numeral "), + ("$", " dolar "), + ("£", " libra "), + ("°", " grados "), + ] + ], + "fr": [ + (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + for x in [ + ("&", " et "), + ("@", " arobase "), + ("%", " pour cent "), + ("#", " dièse "), + ("$", " dollar "), + ("£", " livre "), + ("°", " degrés "), + ] + ], + "de": [ + (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + for x in [ + ("&", " und "), + ("@", " at "), + ("%", " prozent "), + ("#", " raute "), + ("$", " dollar "), + ("£", " pfund "), + ("°", " grad "), + ] + ], + "pt": [ + (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + for x in [ + ("&", " e "), + ("@", " arroba "), + ("%", " por cento "), + ("#", " cardinal "), + ("$", " dólar "), + ("£", " libra "), + ("°", " graus "), + ] + ], + "it": [ + (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + for x in [ + ("&", " e "), + ("@", " chiocciola "), + ("%", " per cento "), + ("#", " cancelletto "), + ("$", " dollaro "), + ("£", " sterlina "), 
+ ("°", " gradi "), + ] + ], + "pl": [ + (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + for x in [ + ("&", " i "), + ("@", " małpa "), + ("%", " procent "), + ("#", " krzyżyk "), + ("$", " dolar "), + ("£", " funt "), + ("°", " stopnie "), + ] + ], + "ar": [ + # Arabic + (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + for x in [ + ("&", " و "), + ("@", " على "), + ("%", " في المئة "), + ("#", " رقم "), + ("$", " دولار "), + ("£", " جنيه "), + ("°", " درجة "), + ] + ], + "zh": [ + # Chinese + (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + for x in [ + ("&", " 和 "), + ("@", " 在 "), + ("%", " 百分之 "), + ("#", " 号 "), + ("$", " 美元 "), + ("£", " 英镑 "), + ("°", " 度 "), + ] + ], + "cs": [ + # Czech + (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + for x in [ + ("&", " a "), + ("@", " na "), + ("%", " procento "), + ("#", " křížek "), + ("$", " dolar "), + ("£", " libra "), + ("°", " stupně "), + ] + ], + "ru": [ + # Russian + (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + for x in [ + ("&", " и "), + ("@", " собака "), + ("%", " процентов "), + ("#", " номер "), + ("$", " доллар "), + ("£", " фунт "), + ("°", " градус "), + ] + ], + "nl": [ + # Dutch + (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + for x in [ + ("&", " en "), + ("@", " bij "), + ("%", " procent "), + ("#", " hekje "), + ("$", " dollar "), + ("£", " pond "), + ("°", " graden "), + ] + ], + "tr": [ + (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + for x in [ + ("&", " ve "), + ("@", " at "), + ("%", " yüzde "), + ("#", " diyez "), + ("$", " dolar "), + ("£", " sterlin "), + ("°", " derece "), + ] + ], + "hu": [ + (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + for x in [ + ("&", " és "), + ("@", " kukac "), + ("%", " százalék "), + ("#", " kettőskereszt "), + ("$", " dollár "), + ("£", " font "), + ("°", " fok "), + ] + ], + "ko": [ + # Korean + (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), 
x[1]) + for x in [ + ("&", " 그리고 "), + ("@", " 에 "), + ("%", " 퍼센트 "), + ("#", " 번호 "), + ("$", " 달러 "), + ("£", " 파운드 "), + ("°", " 도 "), + ] + ], +} + + +def expand_symbols_multilingual(text, lang="en"): + for regex, replacement in _symbols_multilingual[lang]: + text = re.sub(regex, replacement, text) + text = text.replace(" ", " ") # Ensure there are no double spaces + return text.strip() + + +_ordinal_re = { + "en": re.compile(r"([0-9]+)(st|nd|rd|th)"), + "es": re.compile(r"([0-9]+)(º|ª|er|o|a|os|as)"), + "fr": re.compile(r"([0-9]+)(º|ª|er|re|e|ème)"), + "de": re.compile(r"([0-9]+)(st|nd|rd|th|º|ª|\.(?=\s|$))"), + "pt": re.compile(r"([0-9]+)(º|ª|o|a|os|as)"), + "it": re.compile(r"([0-9]+)(º|°|ª|o|a|i|e)"), + "pl": re.compile(r"([0-9]+)(º|ª|st|nd|rd|th)"), + "ar": re.compile(r"([0-9]+)(ون|ين|ث|ر|ى)"), + "cs": re.compile(r"([0-9]+)\.(?=\s|$)"), # In Czech, a dot is often used after the number to indicate ordinals. + "ru": re.compile(r"([0-9]+)(-й|-я|-е|-ое|-ье|-го)"), + "nl": re.compile(r"([0-9]+)(de|ste|e)"), + "tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"), + "hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"), + "ko": re.compile(r"([0-9]+)(번째|번|차|째)"), +} +_number_re = re.compile(r"[0-9]+") +_currency_re = { + "USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"), + "GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"), + "EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"), +} + +_comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b") +_dot_number_re = re.compile(r"\b\d{1,3}(.\d{3})*(\,\d+)?\b") +_decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)") + + +def _remove_commas(m): + text = m.group(0) + if "," in text: + text = text.replace(",", "") + return text + + +def _remove_dots(m): + text = m.group(0) + if "." 
in text: + text = text.replace(".", "") + return text + + +def _expand_decimal_point(m, lang="en"): + amount = m.group(1).replace(",", ".") + return num2words(float(amount), lang=lang if lang != "cs" else "cz") + + +def _expand_currency(m, lang="en", currency="USD"): + amount = float((re.sub(r"[^\d.]", "", m.group(0).replace(",", ".")))) + full_amount = num2words(amount, to="currency", currency=currency, lang=lang if lang != "cs" else "cz") + + and_equivalents = { + "en": ", ", + "es": " con ", + "fr": " et ", + "de": " und ", + "pt": " e ", + "it": " e ", + "pl": ", ", + "cs": ", ", + "ru": ", ", + "nl": ", ", + "ar": ", ", + "tr": ", ", + "hu": ", ", + "ko": ", ", + } + + if amount.is_integer(): + last_and = full_amount.rfind(and_equivalents[lang]) + if last_and != -1: + full_amount = full_amount[:last_and] + + return full_amount + + +def _expand_ordinal(m, lang="en"): + return num2words(int(m.group(1)), ordinal=True, lang=lang if lang != "cs" else "cz") + + +def _expand_number(m, lang="en"): + return num2words(int(m.group(0)), lang=lang if lang != "cs" else "cz") + + +def expand_numbers_multilingual(text, lang="en"): + if lang == "zh": + text = zh_num2words()(text) + else: + if lang in ["en", "ru"]: + text = re.sub(_comma_number_re, _remove_commas, text) + else: + text = re.sub(_dot_number_re, _remove_dots, text) + try: + text = re.sub(_currency_re["GBP"], lambda m: _expand_currency(m, lang, "GBP"), text) + text = re.sub(_currency_re["USD"], lambda m: _expand_currency(m, lang, "USD"), text) + text = re.sub(_currency_re["EUR"], lambda m: _expand_currency(m, lang, "EUR"), text) + except: + pass + if lang != "tr": + text = re.sub(_decimal_number_re, lambda m: _expand_decimal_point(m, lang), text) + text = re.sub(_ordinal_re[lang], lambda m: _expand_ordinal(m, lang), text) + text = re.sub(_number_re, lambda m: _expand_number(m, lang), text) + return text + + +def lowercase(text): + return text.lower() + + +def collapse_whitespace(text): + return 
re.sub(_whitespace_re, " ", text) + + +def multilingual_cleaners(text, lang): + text = text.replace('"', "") + if lang == "tr": + text = text.replace("İ", "i") + text = text.replace("Ö", "ö") + text = text.replace("Ü", "ü") + text = lowercase(text) + try: + text = expand_numbers_multilingual(text, lang) + except: + pass + try: + text = expand_abbreviations_multilingual(text, lang) + except: + pass + try: + text = expand_symbols_multilingual(text, lang=lang) + except: + pass + text = collapse_whitespace(text) + return text + + +def basic_cleaners(text): + """Basic pipeline that lowercases and collapses whitespace without transliteration.""" + text = lowercase(text) + text = collapse_whitespace(text) + return text + + +def chinese_transliterate(text): + return "".join( + [p[0] for p in pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)] + ) + + +def japanese_cleaners(text, katsu): + text = katsu.romaji(text) + text = lowercase(text) + return text + + +def korean_transliterate(text): + r = Transliter(academic) + return r.translit(text) + + +DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "vocab.json") + + +class VoiceBpeTokenizer: + def __init__(self, vocab_file=DEFAULT_VOCAB_FILE): + self.tokenizer = None + if vocab_file is not None: + self.tokenizer = Tokenizer.from_file(vocab_file) + self.char_limits = { + "en": 10000, + "de": 253, + "fr": 273, + "es": 239, + "it": 213, + "pt": 203, + "pl": 224, + "zh": 82, + "ar": 166, + "cs": 186, + "ru": 182, + "nl": 251, + "tr": 226, + "ja": 71, + "hu": 224, + "ko": 95, + } + + @cached_property + def katsu(self): + import cutlet + + return cutlet.Cutlet() + + def check_input_length(self, txt, lang): + lang = lang.split("-")[0] # remove the region + limit = self.char_limits.get(lang, 250) + # if len(txt) > limit: + # print( + # f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio." 
+ # ) + + def preprocess_text(self, txt, lang): + if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}: + txt = multilingual_cleaners(txt, lang) + if lang == "zh": + txt = chinese_transliterate(txt) + if lang == "ko": + txt = korean_transliterate(txt) + elif lang == "ja": + txt = japanese_cleaners(txt, self.katsu) + elif lang == "hi": + # @manmay will implement this + txt = basic_cleaners(txt) + else: + raise NotImplementedError(f"Language '{lang}' is not supported.") + return txt + + def encode(self, txt, lang): + lang = lang.split("-")[0] # remove the region + self.check_input_length(txt, lang) + txt = self.preprocess_text(txt, lang) + lang = "zh-cn" if lang == "zh" else lang + txt = f"[{lang}]{txt}" + txt = txt.replace(" ", "[SPACE]") + return self.tokenizer.encode(txt).ids + + def decode(self, seq, skip_special_tokens=False): + if isinstance(seq, torch.Tensor): + seq = seq.cpu().numpy() + txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(" ", "") + txt = txt.replace("[SPACE]", " ") + txt = txt.replace("[STOP]", "") + # txt = txt.replace("[UNK]", "") + return txt + + + #copy from https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py#L3936 + def batch_decode( + self, + sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"], + skip_special_tokens: bool = False, + ) -> List[str]: + """ + Convert a list of lists of token ids into a list of strings by calling decode. + + Args: + sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. 
+ + Returns: + `List[str]`: The list of decoded sentences. + """ + return [ + self.decode(seq) + for seq in sequences + ] + + #https://github.com/coqui-ai/TTS/blob/dev/TTS/tts/layers/xtts/trainer/dataset.py#L202 + # def pad(self): + + def __len__(self): + return self.tokenizer.get_vocab_size() + + def get_number_tokens(self): + return max(self.tokenizer.get_vocab().values()) + 1 + + +def test_expand_numbers_multilingual(): + test_cases = [ + # English + ("In 12.5 seconds.", "In twelve point five seconds.", "en"), + ("There were 50 soldiers.", "There were fifty soldiers.", "en"), + ("This is a 1st test", "This is a first test", "en"), + ("That will be $20 sir.", "That will be twenty dollars sir.", "en"), + ("That will be 20€ sir.", "That will be twenty euro sir.", "en"), + ("That will be 20.15€ sir.", "That will be twenty euro, fifteen cents sir.", "en"), + ("That's 100,000.5.", "That's one hundred thousand point five.", "en"), + # French + ("En 12,5 secondes.", "En douze virgule cinq secondes.", "fr"), + ("Il y avait 50 soldats.", "Il y avait cinquante soldats.", "fr"), + ("Ceci est un 1er test", "Ceci est un premier test", "fr"), + ("Cela vous fera $20 monsieur.", "Cela vous fera vingt dollars monsieur.", "fr"), + ("Cela vous fera 20€ monsieur.", "Cela vous fera vingt euros monsieur.", "fr"), + ("Cela vous fera 20,15€ monsieur.", "Cela vous fera vingt euros et quinze centimes monsieur.", "fr"), + ("Ce sera 100.000,5.", "Ce sera cent mille virgule cinq.", "fr"), + # German + ("In 12,5 Sekunden.", "In zwölf Komma fünf Sekunden.", "de"), + ("Es gab 50 Soldaten.", "Es gab fünfzig Soldaten.", "de"), + ("Dies ist ein 1. 
Test", "Dies ist ein erste Test", "de"), # Issue with gender + ("Das macht $20 Herr.", "Das macht zwanzig Dollar Herr.", "de"), + ("Das macht 20€ Herr.", "Das macht zwanzig Euro Herr.", "de"), + ("Das macht 20,15€ Herr.", "Das macht zwanzig Euro und fünfzehn Cent Herr.", "de"), + # Spanish + ("En 12,5 segundos.", "En doce punto cinco segundos.", "es"), + ("Había 50 soldados.", "Había cincuenta soldados.", "es"), + ("Este es un 1er test", "Este es un primero test", "es"), + ("Eso le costará $20 señor.", "Eso le costará veinte dólares señor.", "es"), + ("Eso le costará 20€ señor.", "Eso le costará veinte euros señor.", "es"), + ("Eso le costará 20,15€ señor.", "Eso le costará veinte euros con quince céntimos señor.", "es"), + # Italian + ("In 12,5 secondi.", "In dodici virgola cinque secondi.", "it"), + ("C'erano 50 soldati.", "C'erano cinquanta soldati.", "it"), + ("Questo è un 1° test", "Questo è un primo test", "it"), + ("Ti costerà $20 signore.", "Ti costerà venti dollari signore.", "it"), + ("Ti costerà 20€ signore.", "Ti costerà venti euro signore.", "it"), + ("Ti costerà 20,15€ signore.", "Ti costerà venti euro e quindici centesimi signore.", "it"), + # Portuguese + ("Em 12,5 segundos.", "Em doze vírgula cinco segundos.", "pt"), + ("Havia 50 soldados.", "Havia cinquenta soldados.", "pt"), + ("Este é um 1º teste", "Este é um primeiro teste", "pt"), + ("Isso custará $20 senhor.", "Isso custará vinte dólares senhor.", "pt"), + ("Isso custará 20€ senhor.", "Isso custará vinte euros senhor.", "pt"), + ( + "Isso custará 20,15€ senhor.", + "Isso custará vinte euros e quinze cêntimos senhor.", + "pt", + ), # "cêntimos" should be "centavos" num2words issue + # Polish + ("W 12,5 sekundy.", "W dwanaście przecinek pięć sekundy.", "pl"), + ("Było 50 żołnierzy.", "Było pięćdziesiąt żołnierzy.", "pl"), + ("To będzie kosztować 20€ panie.", "To będzie kosztować dwadzieścia euro panie.", "pl"), + ("To będzie kosztować 20,15€ panie.", "To będzie kosztować dwadzieścia euro, 
piętnaście centów panie.", "pl"), + # Arabic + ("في الـ 12,5 ثانية.", "في الـ اثنا عشر , خمسون ثانية.", "ar"), + ("كان هناك 50 جنديًا.", "كان هناك خمسون جنديًا.", "ar"), + # ("ستكون النتيجة $20 يا سيد.", 'ستكون النتيجة عشرون دولار يا سيد.', 'ar'), # $ and € are mising from num2words + # ("ستكون النتيجة 20€ يا سيد.", 'ستكون النتيجة عشرون يورو يا سيد.', 'ar'), + # Czech + ("Za 12,5 vteřiny.", "Za dvanáct celá pět vteřiny.", "cs"), + ("Bylo tam 50 vojáků.", "Bylo tam padesát vojáků.", "cs"), + ("To bude stát 20€ pane.", "To bude stát dvacet euro pane.", "cs"), + ("To bude 20.15€ pane.", "To bude dvacet euro, patnáct centů pane.", "cs"), + # Russian + ("Через 12.5 секунды.", "Через двенадцать запятая пять секунды.", "ru"), + ("Там было 50 солдат.", "Там было пятьдесят солдат.", "ru"), + ("Это будет 20.15€ сэр.", "Это будет двадцать евро, пятнадцать центов сэр.", "ru"), + ("Это будет стоить 20€ господин.", "Это будет стоить двадцать евро господин.", "ru"), + # Dutch + ("In 12,5 seconden.", "In twaalf komma vijf seconden.", "nl"), + ("Er waren 50 soldaten.", "Er waren vijftig soldaten.", "nl"), + ("Dat wordt dan $20 meneer.", "Dat wordt dan twintig dollar meneer.", "nl"), + ("Dat wordt dan 20€ meneer.", "Dat wordt dan twintig euro meneer.", "nl"), + # Chinese (Simplified) + ("在12.5秒内", "在十二点五秒内", "zh"), + ("有50名士兵", "有五十名士兵", "zh"), + # ("那将是$20先生", '那将是二十美元先生', 'zh'), currency doesn't work + # ("那将是20€先生", '那将是二十欧元先生', 'zh'), + # Turkish + # ("12,5 saniye içinde.", 'On iki virgül beş saniye içinde.', 'tr'), # decimal doesn't work for TR + ("50 asker vardı.", "elli asker vardı.", "tr"), + ("Bu 1. test", "Bu birinci test", "tr"), + # ("Bu 100.000,5.", 'Bu yüz bin virgül beş.', 'tr'), + # Hungarian + ("12,5 másodperc alatt.", "tizenkettő egész öt tized másodperc alatt.", "hu"), + ("50 katona volt.", "ötven katona volt.", "hu"), + ("Ez az 1. 
teszt", "Ez az első teszt", "hu"), + # Korean + ("12.5 초 안에.", "십이 점 다섯 초 안에.", "ko"), + ("50 명의 병사가 있었다.", "오십 명의 병사가 있었다.", "ko"), + ("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다", "ko"), + ] + for a, b, lang in test_cases: + out = expand_numbers_multilingual(a, lang=lang) + assert out == b, f"'{out}' vs '{b}'" + + +def test_abbreviations_multilingual(): + test_cases = [ + # English + ("Hello Mr. Smith.", "Hello mister Smith.", "en"), + ("Dr. Jones is here.", "doctor Jones is here.", "en"), + # Spanish + ("Hola Sr. Garcia.", "Hola señor Garcia.", "es"), + ("La Dra. Martinez es muy buena.", "La doctora Martinez es muy buena.", "es"), + # French + ("Bonjour Mr. Dupond.", "Bonjour monsieur Dupond.", "fr"), + ("Mme. Moreau est absente aujourd'hui.", "madame Moreau est absente aujourd'hui.", "fr"), + # German + ("Frau Dr. Müller ist sehr klug.", "Frau doktor Müller ist sehr klug.", "de"), + # Portuguese + ("Olá Sr. Silva.", "Olá senhor Silva.", "pt"), + ("Dra. Costa, você está disponível?", "doutora Costa, você está disponível?", "pt"), + # Italian + ("Buongiorno, Sig. Rossi.", "Buongiorno, signore Rossi.", "it"), + # ("Sig.ra Bianchi, posso aiutarti?", 'signora Bianchi, posso aiutarti?', 'it'), # Issue with matching that pattern + # Polish + ("Dzień dobry, P. Kowalski.", "Dzień dobry, pani Kowalski.", "pl"), + ("M. Nowak, czy mogę zadać pytanie?", "pan Nowak, czy mogę zadać pytanie?", "pl"), + # Czech + ("P. Novák", "pan Novák", "cs"), + ("Dr. Vojtěch", "doktor Vojtěch", "cs"), + # Dutch + ("Dhr. Jansen", "de heer Jansen", "nl"), + ("Mevr. de Vries", "mevrouw de Vries", "nl"), + # Russian + ("Здравствуйте Г-н Иванов.", "Здравствуйте господин Иванов.", "ru"), + ("Д-р Смирнов здесь, чтобы увидеть вас.", "доктор Смирнов здесь, чтобы увидеть вас.", "ru"), + # Turkish + ("Merhaba B. Yılmaz.", "Merhaba bay Yılmaz.", "tr"), + ("Dr. Ayşe burada.", "doktor Ayşe burada.", "tr"), + # Hungarian + ("Dr. 
Szabó itt van.", "doktor Szabó itt van.", "hu"), + ] + + for a, b, lang in test_cases: + out = expand_abbreviations_multilingual(a, lang=lang) + assert out == b, f"'{out}' vs '{b}'" + + +def test_symbols_multilingual(): + test_cases = [ + ("I have 14% battery", "I have 14 percent battery", "en"), + ("Te veo @ la fiesta", "Te veo arroba la fiesta", "es"), + ("J'ai 14° de fièvre", "J'ai 14 degrés de fièvre", "fr"), + ("Die Rechnung beträgt £ 20", "Die Rechnung beträgt pfund 20", "de"), + ("O meu email é ana&joao@gmail.com", "O meu email é ana e joao arroba gmail.com", "pt"), + ("linguaggio di programmazione C#", "linguaggio di programmazione C cancelletto", "it"), + ("Moja temperatura to 36.6°", "Moja temperatura to 36.6 stopnie", "pl"), + ("Mám 14% baterie", "Mám 14 procento baterie", "cs"), + ("Těším se na tebe @ party", "Těším se na tebe na party", "cs"), + ("У меня 14% заряда", "У меня 14 процентов заряда", "ru"), + ("Я буду @ дома", "Я буду собака дома", "ru"), + ("Ik heb 14% batterij", "Ik heb 14 procent batterij", "nl"), + ("Ik zie je @ het feest", "Ik zie je bij het feest", "nl"), + ("لدي 14% في البطارية", "لدي 14 في المئة في البطارية", "ar"), + ("我的电量为 14%", "我的电量为 14 百分之", "zh"), + ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"), + ("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"), + ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"), + ] + + for a, b, lang in test_cases: + out = expand_symbols_multilingual(a, lang=lang) + assert out == b, f"'{out}' vs '{b}'" + + +if __name__ == "__main__": + test_expand_numbers_multilingual() + test_abbreviations_multilingual() + test_symbols_multilingual() \ No newline at end of file diff --git a/models/lyrics_utils/vocab.json b/models/lyrics_utils/vocab.json new file mode 100644 index 0000000..519ed34 --- /dev/null +++ b/models/lyrics_utils/vocab.json @@ -0,0 +1,15535 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "special": 
true, + "content": "[STOP]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 1, + "special": true, + "content": "[UNK]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 2, + "special": true, + "content": "[SPACE]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 259, + "special": true, + "content": "[en]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 260, + "special": true, + "content": "[de]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 261, + "special": true, + "content": "[START]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 262, + "special": true, + "content": "[fr]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 284, + "special": true, + "content": "[es]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 285, + "special": true, + "content": "[it]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 286, + "special": true, + "content": "[pt]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 294, + "special": true, + "content": "[pl]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 295, + "special": true, + "content": "[tr]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 267, + "special": true, + "content": "[ru]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 293, + "special": true, + "content": "[cs]", + "single_word": false, + "lstrip": false, + "rstrip": false, + 
"normalized": false + }, + { + "id": 297, + "special": true, + "content": "[nl]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 5022, + "special": true, + "content": "[ar]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 5023, + "special": true, + "content": "[zh-cn]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 5412, + "special": true, + "content": "[ja]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 5753, + "special": true, + "content": "[hu]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 6152, + "special": true, + "content": "[ko]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 6680, + "special": true, + "content": "[hi]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 6681, + "special": true, + "content": "[start]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 6682, + "special": true, + "content": "[intro]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 6683, + "special": true, + "content": "[verse]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 6684, + "special": true, + "content": "[chorus]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 6685, + "special": true, + "content": "[bridge]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 6686, + "special": true, + "content": "[outro]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 6687, + "special": true, + 
"content": "[end]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 6688, + "special": true, + "content": "[inst]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 6689, + "special": true, + "content": "[solo]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 6690, + "special": true, + "content": "[hook]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 6691, + "special": true, + "content": "[pre-chorus]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + }, + { + "id": 6692, + "special": true, + "content": "[break]", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": null, + "decoder": null, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "[UNK]", + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "vocab": { + "[STOP]": 0, + "[UNK]": 1, + "[SPACE]": 2, + "!": 3, + "'": 4, + "(": 5, + ")": 6, + ",": 7, + "-": 8, + ".": 9, + "/": 10, + ":": 11, + ";": 12, + "?": 13, + "a": 14, + "b": 15, + "c": 16, + "d": 17, + "e": 18, + "f": 19, + "g": 20, + "h": 21, + "i": 22, + "j": 23, + "k": 24, + "l": 25, + "m": 26, + "n": 27, + "o": 28, + "p": 29, + "q": 30, + "r": 31, + "s": 32, + "t": 33, + "u": 34, + "v": 35, + "w": 36, + "x": 37, + "y": 38, + "z": 39, + "th": 40, + "in": 41, + "the": 42, + "an": 43, + "er": 44, + "ou": 45, + "re": 46, + "on": 47, + "at": 48, + "ed": 49, + "en": 50, + "to": 51, + "ing": 52, + "and": 53, + "is": 54, + "as": 55, + "al": 56, + "or": 57, + "of": 58, + "ar": 59, + "it": 60, + "es": 61, + "he": 62, + "st": 63, + "le": 64, + "om": 65, + "se": 66, + "be": 67, + "ad": 68, + "ow": 69, + "ly": 70, + "ch": 71, + "wh": 72, 
+ "that": 73, + "you": 74, + "li": 75, + "ve": 76, + "ac": 77, + "ti": 78, + "ld": 79, + "me": 80, + "was": 81, + "gh": 82, + "id": 83, + "ll": 84, + "wi": 85, + "ent": 86, + "for": 87, + "ay": 88, + "ro": 89, + "ver": 90, + "ic": 91, + "her": 92, + "ke": 93, + "his": 94, + "no": 95, + "ut": 96, + "un": 97, + "ir": 98, + "lo": 99, + "we": 100, + "ri": 101, + "ha": 102, + "with": 103, + "ght": 104, + "out": 105, + "im": 106, + "ion": 107, + "all": 108, + "ab": 109, + "one": 110, + "ne": 111, + "ge": 112, + "ould": 113, + "ter": 114, + "mo": 115, + "had": 116, + "ce": 117, + "she": 118, + "go": 119, + "sh": 120, + "ur": 121, + "am": 122, + "so": 123, + "pe": 124, + "my": 125, + "de": 126, + "are": 127, + "but": 128, + "ome": 129, + "fr": 130, + "ther": 131, + "fe": 132, + "su": 133, + "do": 134, + "con": 135, + "te": 136, + "ain": 137, + "ere": 138, + "po": 139, + "if": 140, + "they": 141, + "us": 142, + "ag": 143, + "tr": 144, + "now": 145, + "oun": 146, + "this": 147, + "have": 148, + "not": 149, + "sa": 150, + "il": 151, + "up": 152, + "thing": 153, + "from": 154, + "ap": 155, + "him": 156, + "ack": 157, + "ation": 158, + "ant": 159, + "our": 160, + "op": 161, + "like": 162, + "ust": 163, + "ess": 164, + "bo": 165, + "ok": 166, + "ul": 167, + "ind": 168, + "ex": 169, + "com": 170, + "some": 171, + "there": 172, + "ers": 173, + "co": 174, + "res": 175, + "man": 176, + "ard": 177, + "pl": 178, + "wor": 179, + "way": 180, + "tion": 181, + "fo": 182, + "ca": 183, + "were": 184, + "by": 185, + "ate": 186, + "pro": 187, + "ted": 188, + "ound": 189, + "own": 190, + "would": 191, + "ts": 192, + "what": 193, + "qu": 194, + "ally": 195, + "ight": 196, + "ck": 197, + "gr": 198, + "when": 199, + "ven": 200, + "can": 201, + "ough": 202, + "ine": 203, + "end": 204, + "per": 205, + "ous": 206, + "od": 207, + "ide": 208, + "know": 209, + "ty": 210, + "very": 211, + "si": 212, + "ak": 213, + "who": 214, + "about": 215, + "ill": 216, + "them": 217, + "est": 218, + "red": 219, + 
"ye": 220, + "could": 221, + "ong": 222, + "your": 223, + "their": 224, + "em": 225, + "just": 226, + "other": 227, + "into": 228, + "any": 229, + "whi": 230, + "um": 231, + "tw": 232, + "ast": 233, + "der": 234, + "did": 235, + "ie": 236, + "been": 237, + "ace": 238, + "ink": 239, + "ity": 240, + "back": 241, + "ting": 242, + "br": 243, + "more": 244, + "ake": 245, + "pp": 246, + "then": 247, + "sp": 248, + "el": 249, + "use": 250, + "bl": 251, + "said": 252, + "over": 253, + "get": 254, + "ß": 255, + "ä": 256, + "ö": 257, + "ü": 258, + "[en]": 259, + "[de]": 260, + "[START]": 261, + "[fr]": 262, + "œ": 263, + "ï": 264, + "ê": 265, + "â": 266, + "[ru]": 267, + "ÿ": 268, + "è": 269, + "à": 270, + "ë": 271, + "ù": 272, + "î": 273, + "ç": 274, + "æ": 275, + "ô": 276, + "û": 277, + "á": 278, + "é": 279, + "í": 280, + "ó": 281, + "ú": 282, + "ñ": 283, + "[es]": 284, + "[it]": 285, + "[pt]": 286, + "ń": 287, + "ś": 288, + "ę": 289, + "ą": 290, + "ż": 291, + "ć": 292, + "[cs]": 293, + "[pl]": 294, + "[tr]": 295, + "ã": 296, + "[nl]": 297, + "ş": 298, + "ğ": 299, + "ı": 300, + "ò": 301, + "ì": 302, + "¿": 303, + "…": 304, + "i̇": 305, + "õ": 306, + "\"": 307, + "´": 308, + "ø": 309, + "č": 310, + "ō": 311, + "š": 312, + "ž": 313, + "̇": 314, + "ei": 315, + "ich": 316, + "ein": 317, + "au": 318, + "sch": 319, + "und": 320, + "die": 321, + "da": 322, + "den": 323, + "gen": 324, + "zu": 325, + "hr": 326, + "ten": 327, + "mi": 328, + "sie": 329, + "das": 330, + "eine": 331, + "icht": 332, + "ber": 333, + "ach": 334, + "auf": 335, + "lich": 336, + "nicht": 337, + "mm": 338, + "ben": 339, + "war": 340, + "mit": 341, + "sich": 342, + "ig": 343, + "aus": 344, + "ist": 345, + "wie": 346, + "och": 347, + "ung": 348, + "ann": 349, + "ür": 350, + "hn": 351, + "ihr": 352, + "sen": 353, + "tz": 354, + "dem": 355, + "eit": 356, + "hat": 357, + "wir": 358, + "von": 359, + "wei": 360, + "ier": 361, + "ra": 362, + "einen": 363, + "vor": 364, + "als": 365, + "wo": 366, + "rei": 367, + 
"ste": 368, + "lie": 369, + "auch": 370, + "du": 371, + "des": 372, + "ko": 373, + "über": 374, + "bei": 375, + "hen": 376, + "hm": 377, + "lei": 378, + "aber": 379, + "wen": 380, + "hl": 381, + "ger": 382, + "nach": 383, + "ft": 384, + "imm": 385, + "je": 386, + "schen": 387, + "wer": 388, + "ser": 389, + "än": 390, + "sein": 391, + "ol": 392, + "cht": 393, + "für": 394, + "kl": 395, + "ff": 396, + "einem": 397, + "nen": 398, + "ja": 399, + "noch": 400, + "hatte": 401, + "pf": 402, + "hin": 403, + "di": 404, + "chen": 405, + "rü": 406, + "iel": 407, + "sel": 408, + "dass": 409, + "ihn": 410, + "mir": 411, + "schl": 412, + "ön": 413, + "gan": 414, + "gt": 415, + "einer": 416, + "sten": 417, + "mich": 418, + "wenn": 419, + "ell": 420, + "gte": 421, + "mal": 422, + "gel": 423, + "ken": 424, + "nur": 425, + "mmen": 426, + "fü": 427, + "ern": 428, + "ör": 429, + "unter": 430, + "ander": 431, + "dur": 432, + "uch": 433, + "ta": 434, + "men": 435, + "mach": 436, + "doch": 437, + "durch": 438, + "os": 439, + "gl": 440, + "hal": 441, + "ihre": 442, + "wä": 443, + "immer": 444, + "ihm": 445, + "kann": 446, + "ort": 447, + "dann": 448, + "lan": 449, + "tzt": 450, + "oder": 451, + "hren": 452, + "et": 453, + "kön": 454, + "ick": 455, + "fa": 456, + "wieder": 457, + "daß": 458, + "mein": 459, + "fen": 460, + "ganz": 461, + "diese": 462, + "ster": 463, + "dar": 464, + "wa": 465, + "ges": 466, + "na": 467, + "fl": 468, + "igen": 469, + "sche": 470, + "ungen": 471, + "mehr": 472, + "ßen": 473, + "ot": 474, + "kon": 475, + "gew": 476, + "haben": 477, + "geh": 478, + "ät": 479, + "sind": 480, + "dr": 481, + "wel": 482, + "uns": 483, + "vo": 484, + "ma": 485, + "ute": 486, + "schon": 487, + "bes": 488, + "gesch": 489, + "bt": 490, + "che": 491, + "son": 492, + "ob": 493, + "la": 494, + "rück": 495, + "seine": 496, + "kr": 497, + "fre": 498, + "eil": 499, + "zum": 500, + "hier": 501, + "kt": 502, + "ige": 503, + "spr": 504, + "leben": 505, + "bst": 506, + "zeit": 507, + "gro": 508, + 
"denn": 509, + "ho": 510, + "scha": 511, + "bar": 512, + "alle": 513, + "gegen": 514, + "wür": 515, + "mü": 516, + "ze": 517, + "werden": 518, + "jetzt": 519, + "kommen": 520, + "nie": 521, + "sei": 522, + "heit": 523, + "soll": 524, + "glei": 525, + "meine": 526, + "woll": 527, + "ner": 528, + "habe": 529, + "wur": 530, + "lichen": 531, + "assen": 532, + "nte": 533, + "sehen": 534, + "wird": 535, + "bis": 536, + "gar": 537, + "ien": 538, + "mus": 539, + "uß": 540, + "är": 541, + "stell": 542, + "keit": 543, + "zwei": 544, + "selbst": 545, + "sta": 546, + "pa": 547, + "sagte": 548, + "tet": 549, + "kam": 550, + "ssen": 551, + "viel": 552, + "ug": 553, + "zen": 554, + "hei": 555, + "mann": 556, + "will": 557, + "geb": 558, + "waren": 559, + "ück": 560, + "äch": 561, + "mer": 562, + "ru": 563, + "hau": 564, + "eigen": 565, + "ang": 566, + "weg": 567, + "blick": 568, + "fra": 569, + "alles": 570, + "ka": 571, + "augen": 572, + "fin": 573, + "liche": 574, + "unser": 575, + "dern": 576, + "herr": 577, + "nun": 578, + "vie": 579, + "chte": 580, + "wohl": 581, + "fall": 582, + "ht": 583, + "ün": 584, + "etwas": 585, + "stand": 586, + "äu": 587, + "mö": 588, + "tel": 589, + "rie": 590, + "dich": 591, + "dies": 592, + "hand": 593, + "bin": 594, + "ffen": 595, + "nichts": 596, + "dan": 597, + "hne": 598, + "ihnen": 599, + "esen": 600, + "dieser": 601, + "frau": 602, + "art": 603, + "dir": 604, + "isch": 605, + "erst": 606, + "gleich": 607, + "komm": 608, + "hör": 609, + "ße": 610, + "dig": 611, + "sehr": 612, + "zei": 613, + "sam": 614, + "aum": 615, + "hät": 616, + "ingen": 617, + "gut": 618, + "mut": 619, + "cken": 620, + "konnte": 621, + "stimm": 622, + "zur": 623, + "itz": 624, + "weil": 625, + "würde": 626, + "fä": 627, + "können": 628, + "keine": 629, + "fer": 630, + "ischen": 631, + "voll": 632, + "eines": 633, + "setz": 634, + "zie": 635, + "del": 636, + "tete": 637, + "seiner": 638, + "ieren": 639, + "gest": 640, + "zurück": 641, + "wurde": 642, + "schn": 643, + 
"pr": 644, + "ließ": 645, + "tra": 646, + "mä": 647, + "gend": 648, + "fol": 649, + "ik": 650, + "schla": 651, + "schaft": 652, + "ater": 653, + "weiß": 654, + "seinen": 655, + "lassen": 656, + "lu": 657, + "unden": 658, + "teil": 659, + "neu": 660, + "iert": 661, + "menschen": 662, + "hmen": 663, + "str": 664, + "gi": 665, + "sah": 666, + "ihren": 667, + "eln": 668, + "weiter": 669, + "gehen": 670, + "iger": 671, + "macht": 672, + "tag": 673, + "also": 674, + "halten": 675, + "nis": 676, + "acht": 677, + "geben": 678, + "og": 679, + "nat": 680, + "mar": 681, + "det": 682, + "ohne": 683, + "haus": 684, + "tro": 685, + "ange": 686, + "lau": 687, + "spiel": 688, + "tre": 689, + "schr": 690, + "inn": 691, + "los": 692, + "machen": 693, + "hätte": 694, + "beg": 695, + "wirk": 696, + "alt": 697, + "glich": 698, + "tes": 699, + "richt": 700, + "freund": 701, + "ihrer": 702, + "fel": 703, + "bel": 704, + "sol": 705, + "einmal": 706, + "eben": 707, + "hol": 708, + "hän": 709, + "tern": 710, + "hö": 711, + "schw": 712, + "recht": 713, + "wahr": 714, + "seinem": 715, + "stehen": 716, + "hlen": 717, + "ins": 718, + "ging": 719, + "wollte": 720, + "wissen": 721, + "ungs": 722, + "ald": 723, + "ass": 724, + "jahr": 725, + "mor": 726, + "welt": 727, + "under": 728, + "zusa": 729, + "kopf": 730, + "lang": 731, + "hinter": 732, + "atz": 733, + "stra": 734, + "angen": 735, + "ank": 736, + "ade": 737, + "glau": 738, + "fach": 739, + "hatten": 740, + "fort": 741, + "eicht": 742, + "iff": 743, + "ler": 744, + "mei": 745, + "diesem": 746, + "kein": 747, + "frei": 748, + "führ": 749, + "vom": 750, + "β": 751, + "ai": 752, + "ait": 753, + "que": 754, + "les": 755, + "av": 756, + "ais": 757, + "oi": 758, + "eu": 759, + "lle": 760, + "par": 761, + "ans": 762, + "ment": 763, + "ét": 764, + "une": 765, + "pas": 766, + "qui": 767, + "elle": 768, + "dé": 769, + "pour": 770, + "dans": 771, + "ré": 772, + "tou": 773, + "vous": 774, + "vi": 775, + "ouv": 776, + "mon": 777, + "sur": 778, + "ci": 
779, + "plu": 780, + "ère": 781, + "mais": 782, + "ois": 783, + "plus": 784, + "ée": 785, + "aient": 786, + "mp": 787, + "lui": 788, + "ave": 789, + "était": 790, + "ses": 791, + "tout": 792, + "oir": 793, + "avait": 794, + "és": 795, + "mes": 796, + "nous": 797, + "eux": 798, + "bi": 799, + "ons": 800, + "pu": 801, + "ces": 802, + "tu": 803, + "leur": 804, + "don": 805, + "eur": 806, + "ette": 807, + "aire": 808, + "avec": 809, + "dit": 810, + "té": 811, + "ille": 812, + "comme": 813, + "cr": 814, + "ux": 815, + "ès": 816, + "aux": 817, + "jour": 818, + "ils": 819, + "bien": 820, + "cou": 821, + "quel": 822, + "peu": 823, + "cette": 824, + "cu": 825, + "mê": 826, + "fait": 827, + "gu": 828, + "être": 829, + "ité": 830, + "ens": 831, + "ni": 832, + "lé": 833, + "dis": 834, + "ble": 835, + "né": 836, + "puis": 837, + "même": 838, + "ques": 839, + "fi": 840, + "age": 841, + "moi": 842, + "ence": 843, + "ont": 844, + "main": 845, + "ors": 846, + "aut": 847, + "ance": 848, + "mé": 849, + "sans": 850, + "sé": 851, + "lon": 852, + "hom": 853, + "car": 854, + "able": 855, + "cher": 856, + "deux": 857, + "enf": 858, + "où": 859, + "ph": 860, + "ure": 861, + "temp": 862, + "pos": 863, + "rent": 864, + "pé": 865, + "faire": 866, + "pi": 867, + "tres": 868, + "ça": 869, + "endre": 870, + "bon": 871, + "sou": 872, + "int": 873, + "pré": 874, + "sent": 875, + "tant": 876, + "cer": 877, + "là": 878, + "lais": 879, + "près": 880, + "bre": 881, + "cour": 882, + "pet": 883, + "comp": 884, + "lait": 885, + "trouv": 886, + "entre": 887, + "sont": 888, + "dev": 889, + "nu": 890, + "temps": 891, + "dou": 892, + "rait": 893, + "bou": 894, + "quand": 895, + "jours": 896, + "avoir": 897, + "été": 898, + "ale": 899, + "pre": 900, + "fois": 901, + "orte": 902, + "vé": 903, + "non": 904, + "tous": 905, + "jus": 906, + "coup": 907, + "homme": 908, + "ête": 909, + "aussi": 910, + "urs": 911, + "seu": 912, + "ord": 913, + "min": 914, + "gé": 915, + "core": 916, + "va": 917, + "vre": 918, + 
"encore": 919, + "sem": 920, + "ite": 921, + "autre": 922, + "pris": 923, + "peut": 924, + "ue": 925, + "ante": 926, + "gn": 927, + "rép": 928, + "hu": 929, + "sion": 930, + "votre": 931, + "dire": 932, + "ez": 933, + "fem": 934, + "leurs": 935, + "met": 936, + "cri": 937, + "mis": 938, + "tour": 939, + "rai": 940, + "jam": 941, + "regar": 942, + "rien": 943, + "vers": 944, + "suis": 945, + "pouv": 946, + "vis": 947, + "grand": 948, + "ants": 949, + "cor": 950, + "rer": 951, + "cé": 952, + "tent": 953, + "pres": 954, + "vou": 955, + "alors": 956, + "sieur": 957, + "aine": 958, + "quoi": 959, + "fon": 960, + "endant": 961, + "arri": 962, + "eure": 963, + "après": 964, + "donc": 965, + "itu": 966, + "lè": 967, + "sait": 968, + "toi": 969, + "cha": 970, + "ail": 971, + "asse": 972, + "imp": 973, + "voy": 974, + "conn": 975, + "pla": 976, + "petit": 977, + "avant": 978, + "nom": 979, + "tin": 980, + "dont": 981, + "sous": 982, + "emp": 983, + "person": 984, + "elles": 985, + "beau": 986, + "parti": 987, + "cho": 988, + "prit": 989, + "toujours": 990, + "rais": 991, + "jamais": 992, + "trav": 993, + "tions": 994, + "très": 995, + "voi": 996, + "ren": 997, + "yeux": 998, + "voir": 999, + "premi": 1000, + "gne": 1001, + "heure": 1002, + "rou": 1003, + "eff": 1004, + "notre": 1005, + "ments": 1006, + "ton": 1007, + "fais": 1008, + "cela": 1009, + "répon": 1010, + "cons": 1011, + "air": 1012, + "ôt": 1013, + "pendant": 1014, + "ici": 1015, + "toute": 1016, + "jet": 1017, + "port": 1018, + "étaient": 1019, + "pen": 1020, + "hé": 1021, + "autres": 1022, + "père": 1023, + "oc": 1024, + "quelques": 1025, + "ique": 1026, + "lis": 1027, + "femme": 1028, + "jou": 1029, + "teur": 1030, + "monde": 1031, + "nes": 1032, + "dre": 1033, + "aff": 1034, + "rap": 1035, + "part": 1036, + "lement": 1037, + "cla": 1038, + "fut": 1039, + "quelque": 1040, + "prendre": 1041, + "rê": 1042, + "aille": 1043, + "sais": 1044, + "ches": 1045, + "let": 1046, + "char": 1047, + "ères": 1048, + "ents": 
1049, + "moins": 1050, + "eau": 1051, + "aî": 1052, + "jeu": 1053, + "heur": 1054, + "ées": 1055, + "tri": 1056, + "point": 1057, + "mom": 1058, + "vent": 1059, + "nouv": 1060, + "gran": 1061, + "trois": 1062, + "sant": 1063, + "toutes": 1064, + "contre": 1065, + "èrent": 1066, + "chez": 1067, + "avez": 1068, + "ût": 1069, + "att": 1070, + "pau": 1071, + "porte": 1072, + "ouver": 1073, + "lit": 1074, + "prés": 1075, + "chose": 1076, + "vit": 1077, + "monsieur": 1078, + "hab": 1079, + "tête": 1080, + "ju": 1081, + "tement": 1082, + "ction": 1083, + "vrai": 1084, + "lar": 1085, + "cet": 1086, + "regard": 1087, + "lant": 1088, + "som": 1089, + "moment": 1090, + "illes": 1091, + "ple": 1092, + "ps": 1093, + "mère": 1094, + "cl": 1095, + "sour": 1096, + "ys": 1097, + "trop": 1098, + "enne": 1099, + "jusqu": 1100, + "avaient": 1101, + "avais": 1102, + "jeune": 1103, + "depuis": 1104, + "personne": 1105, + "fit": 1106, + "cert": 1107, + "jo": 1108, + "oui": 1109, + "rest": 1110, + "semb": 1111, + "cap": 1112, + "mat": 1113, + "mu": 1114, + "long": 1115, + "fran": 1116, + "faut": 1117, + "iti": 1118, + "bli": 1119, + "chev": 1120, + "pri": 1121, + "ente": 1122, + "ainsi": 1123, + "cham": 1124, + "lors": 1125, + "cas": 1126, + "ili": 1127, + "bé": 1128, + "nos": 1129, + "sui": 1130, + "rit": 1131, + "cro": 1132, + "gue": 1133, + "ía": 1134, + "por": 1135, + "las": 1136, + "ón": 1137, + "una": 1138, + "aba": 1139, + "dos": 1140, + "era": 1141, + "mb": 1142, + "para": 1143, + "ás": 1144, + "mos": 1145, + "ando": 1146, + "como": 1147, + "más": 1148, + "ción": 1149, + "tan": 1150, + "dad": 1151, + "ado": 1152, + "fu": 1153, + "cia": 1154, + "mente": 1155, + "sus": 1156, + "tar": 1157, + "za": 1158, + "ba": 1159, + "pero": 1160, + "sin": 1161, + "lla": 1162, + "án": 1163, + "ia": 1164, + "ran": 1165, + "ga": 1166, + "yo": 1167, + "tos": 1168, + "cos": 1169, + "ya": 1170, + "ones": 1171, + "había": 1172, + "hi": 1173, + "esta": 1174, + "mas": 1175, + "tor": 1176, + "aban": 1177, 
+ "dor": 1178, + "ían": 1179, + "tas": 1180, + "én": 1181, + "endo": 1182, + "aque": 1183, + "ero": 1184, + "io": 1185, + "qué": 1186, + "cab": 1187, + "tal": 1188, + "señ": 1189, + "ora": 1190, + "todo": 1191, + "sal": 1192, + "cuando": 1193, + "gun": 1194, + "bu": 1195, + "ras": 1196, + "esto": 1197, + "pare": 1198, + "él": 1199, + "tras": 1200, + "jos": 1201, + "mien": 1202, + "pue": 1203, + "cre": 1204, + "pon": 1205, + "día": 1206, + "tros": 1207, + "sab": 1208, + "sobre": 1209, + "ese": 1210, + "mbre": 1211, + "eron": 1212, + "añ": 1213, + "ido": 1214, + "porque": 1215, + "ella": 1216, + "cen": 1217, + "muy": 1218, + "cal": 1219, + "este": 1220, + "has": 1221, + "có": 1222, + "gra": 1223, + "ros": 1224, + "aquel": 1225, + "dijo": 1226, + "cía": 1227, + "zo": 1228, + "ciones": 1229, + "mbi": 1230, + "elo": 1231, + "tó": 1232, + "ina": 1233, + "todos": 1234, + "tien": 1235, + "estaba": 1236, + "deci": 1237, + "cio": 1238, + "ño": 1239, + "lor": 1240, + "nues": 1241, + "medi": 1242, + "len": 1243, + "vida": 1244, + "ali": 1245, + "pues": 1246, + "ales": 1247, + "vol": 1248, + "mí": 1249, + "rar": 1250, + "cion": 1251, + "hasta": 1252, + "señor": 1253, + "cono": 1254, + "ah": 1255, + "dios": 1256, + "esa": 1257, + "ún": 1258, + "var": 1259, + "san": 1260, + "gui": 1261, + "otros": 1262, + "tado": 1263, + "buen": 1264, + "ña": 1265, + "tiemp": 1266, + "hacer": 1267, + "jer": 1268, + "vu": 1269, + "ana": 1270, + "así": 1271, + "antes": 1272, + "vez": 1273, + "miento": 1274, + "jar": 1275, + "lab": 1276, + "casa": 1277, + "eso": 1278, + "ego": 1279, + "dió": 1280, + "está": 1281, + "encia": 1282, + "eli": 1283, + "ías": 1284, + "tiempo": 1285, + "zar": 1286, + "van": 1287, + "mun": 1288, + "erta": 1289, + "tambi": 1290, + "sí": 1291, + "aun": 1292, + "mismo": 1293, + "entes": 1294, + "mano": 1295, + "ele": 1296, + "nada": 1297, + "segu": 1298, + "mej": 1299, + "erra": 1300, + "tir": 1301, + "uno": 1302, + "donde": 1303, + "toda": 1304, + "desde": 1305, + "también": 
1306, + "cuer": 1307, + "hombre": 1308, + "otro": 1309, + "lib": 1310, + "trar": 1311, + "cual": 1312, + "hay": 1313, + "cada": 1314, + "taba": 1315, + "mento": 1316, + "tenía": 1317, + "quer": 1318, + "eran": 1319, + "siemp": 1320, + "siempre": 1321, + "erto": 1322, + "quí": 1323, + "gos": 1324, + "pués": 1325, + "ellos": 1326, + "después": 1327, + "nue": 1328, + "llo": 1329, + "inter": 1330, + "cómo": 1331, + "ahora": 1332, + "uste": 1333, + "traba": 1334, + "lado": 1335, + "ino": 1336, + "poco": 1337, + "erte": 1338, + "mujer": 1339, + "quier": 1340, + "algun": 1341, + "fue": 1342, + "ojos": 1343, + "enton": 1344, + "vos": 1345, + "esper": 1346, + "much": 1347, + "otra": 1348, + "az": 1349, + "eza": 1350, + "aquí": 1351, + "cias": 1352, + "gua": 1353, + "mucho": 1354, + "decir": 1355, + "esti": 1356, + "idad": 1357, + "algo": 1358, + "ocu": 1359, + "entonces": 1360, + "dido": 1361, + "entos": 1362, + "gri": 1363, + "dado": 1364, + "ios": 1365, + "dose": 1366, + "usted": 1367, + "quien": 1368, + "ami": 1369, + "unto": 1370, + "mejor": 1371, + "bas": 1372, + "solo": 1373, + "pregun": 1374, + "tur": 1375, + "alg": 1376, + "todas": 1377, + "parte": 1378, + "emb": 1379, + "cto": 1380, + "mundo": 1381, + "tiene": 1382, + "tante": 1383, + "palab": 1384, + "tran": 1385, + "aquella": 1386, + "cios": 1387, + "aunque": 1388, + "cuen": 1389, + "tener": 1390, + "fun": 1391, + "respon": 1392, + "allí": 1393, + "xi": 1394, + "han": 1395, + "pens": 1396, + "contra": 1397, + "tura": 1398, + "val": 1399, + "dio": 1400, + "tanto": 1401, + "camin": 1402, + "mó": 1403, + "esp": 1404, + "ada": 1405, + "ío": 1406, + "hacia": 1407, + "dej": 1408, + "estar": 1409, + "ión": 1410, + "gas": 1411, + "vas": 1412, + "noche": 1413, + "ér": 1414, + "años": 1415, + "padre": 1416, + "gus": 1417, + "ár": 1418, + "sino": 1419, + "manos": 1420, + "cido": 1421, + "estu": 1422, + "hubi": 1423, + "vir": 1424, + "bri": 1425, + "raz": 1426, + "chi": 1427, + "puede": 1428, + "menos": 1429, + "habi": 1430, 
+ "homb": 1431, + "neces": 1432, + "may": 1433, + "eros": 1434, + "ría": 1435, + "hecho": 1436, + "escu": 1437, + "lti": 1438, + "ándo": 1439, + "bus": 1440, + "cosas": 1441, + "tú": 1442, + "espa": 1443, + "reci": 1444, + "ctor": 1445, + "prim": 1446, + "dia": 1447, + "dese": 1448, + "mientras": 1449, + "hor": 1450, + "fuer": 1451, + "ida": 1452, + "posi": 1453, + "lante": 1454, + "ano": 1455, + "estas": 1456, + "pli": 1457, + "luego": 1458, + "sión": 1459, + "cin": 1460, + "tierra": 1461, + "guar": 1462, + "cado": 1463, + "encon": 1464, + "pren": 1465, + "mayor": 1466, + "fal": 1467, + "ð": 1468, + "ħ": 1469, + "ň": 1470, + "ə": 1471, + "θ": 1472, + "’": 1473, + "“": 1474, + "”": 1475, + "zi": 1476, + "gli": 1477, + "tto": 1478, + "ono": 1479, + "nel": 1480, + "tti": 1481, + "della": 1482, + "zione": 1483, + "tta": 1484, + "tà": 1485, + "uo": 1486, + "come": 1487, + "alla": 1488, + "oni": 1489, + "ggi": 1490, + "ssi": 1491, + "più": 1492, + "ini": 1493, + "bb": 1494, + "sto": 1495, + "sono": 1496, + "eri": 1497, + "sse": 1498, + "sc": 1499, + "sul": 1500, + "vano": 1501, + "sti": 1502, + "suo": 1503, + "cchi": 1504, + "zza": 1505, + "anche": 1506, + "tte": 1507, + "sci": 1508, + "col": 1509, + "sso": 1510, + "ssa": 1511, + "dei": 1512, + "aveva": 1513, + "zz": 1514, + "amo": 1515, + "gno": 1516, + "sua": 1517, + "ria": 1518, + "sì": 1519, + "ché": 1520, + "dal": 1521, + "ona": 1522, + "spe": 1523, + "gni": 1524, + "tt": 1525, + "delle": 1526, + "questo": 1527, + "nella": 1528, + "dere": 1529, + "anno": 1530, + "dell": 1531, + "uni": 1532, + "bbe": 1533, + "anti": 1534, + "ene": 1535, + "gio": 1536, + "uto": 1537, + "qual": 1538, + "glia": 1539, + "quando": 1540, + "tutto": 1541, + "glio": 1542, + "zioni": 1543, + "cam": 1544, + "esso": 1545, + "ss": 1546, + "mol": 1547, + "loro": 1548, + "perché": 1549, + "cosa": 1550, + "due": 1551, + "poi": 1552, + "sco": 1553, + "cco": 1554, + "gna": 1555, + "tem": 1556, + "prima": 1557, + "così": 1558, + "essere": 1559, + 
"ani": 1560, + "bra": 1561, + "rio": 1562, + "anco": 1563, + "cui": 1564, + "spi": 1565, + "via": 1566, + "gior": 1567, + "bile": 1568, + "ggio": 1569, + "mai": 1570, + "tare": 1571, + "indi": 1572, + "rebbe": 1573, + "senza": 1574, + "zio": 1575, + "tutti": 1576, + "stato": 1577, + "zia": 1578, + "dalla": 1579, + "mia": 1580, + "vita": 1581, + "quella": 1582, + "qua": 1583, + "dove": 1584, + "allo": 1585, + "sempre": 1586, + "zzo": 1587, + "sia": 1588, + "dopo": 1589, + "porta": 1590, + "ccia": 1591, + "erano": 1592, + "anni": 1593, + "chia": 1594, + "enza": 1595, + "propri": 1596, + "anda": 1597, + "cca": 1598, + "occhi": 1599, + "questa": 1600, + "ffi": 1601, + "ron": 1602, + "mio": 1603, + "ris": 1604, + "ogni": 1605, + "rin": 1606, + "far": 1607, + "menti": 1608, + "ancora": 1609, + "fatto": 1610, + "mani": 1611, + "senti": 1612, + "pra": 1613, + "tempo": 1614, + "essi": 1615, + "bbi": 1616, + "lare": 1617, + "pers": 1618, + "sor": 1619, + "anza": 1620, + "pie": 1621, + "verso": 1622, + "altro": 1623, + "tato": 1624, + "cato": 1625, + "ato": 1626, + "volta": 1627, + "cc": 1628, + "fare": 1629, + "ciò": 1630, + "bili": 1631, + "nuo": 1632, + "quello": 1633, + "colo": 1634, + "ppo": 1635, + "trova": 1636, + "ore": 1637, + "rono": 1638, + "molto": 1639, + "almente": 1640, + "sca": 1641, + "vole": 1642, + "tali": 1643, + "sulla": 1644, + "sce": 1645, + "meno": 1646, + "anto": 1647, + "pun": 1648, + "stu": 1649, + "capi": 1650, + "giu": 1651, + "mini": 1652, + "pia": 1653, + "lavo": 1654, + "vero": 1655, + "rsi": 1656, + "altri": 1657, + "scia": 1658, + "suoi": 1659, + "glie": 1660, + "sotto": 1661, + "bene": 1662, + "scri": 1663, + "tale": 1664, + "degli": 1665, + "alc": 1666, + "uomo": 1667, + "pel": 1668, + "pote": 1669, + "essa": 1670, + "scu": 1671, + "signo": 1672, + "stro": 1673, + "uti": 1674, + "sione": 1675, + "gre": 1676, + "fini": 1677, + "lun": 1678, + "esi": 1679, + "passa": 1680, + "rà": 1681, + "mentre": 1682, + "hanno": 1683, + "usci": 1684, + 
"gia": 1685, + "già": 1686, + "mina": 1687, + "tica": 1688, + "giorno": 1689, + "esse": 1690, + "modo": 1691, + "spa": 1692, + "proprio": 1693, + "ori": 1694, + "contro": 1695, + "stru": 1696, + "diven": 1697, + "disse": 1698, + "rato": 1699, + "noi": 1700, + "vere": 1701, + "può": 1702, + "dice": 1703, + "cci": 1704, + "secon": 1705, + "ccio": 1706, + "qualche": 1707, + "tutta": 1708, + "gg": 1709, + "mondo": 1710, + "forma": 1711, + "mma": 1712, + "pensa": 1713, + "deva": 1714, + "fosse": 1715, + "sopra": 1716, + "tamente": 1717, + "ness": 1718, + "quanto": 1719, + "raga": 1720, + "unque": 1721, + "care": 1722, + "stre": 1723, + "grande": 1724, + "picco": 1725, + "guarda": 1726, + "nell": 1727, + "possi": 1728, + "presen": 1729, + "rò": 1730, + "paro": 1731, + "tua": 1732, + "vin": 1733, + "ane": 1734, + "stesso": 1735, + "dav": 1736, + "nei": 1737, + "nelle": 1738, + "ghi": 1739, + "pio": 1740, + "lato": 1741, + "sid": 1742, + "fine": 1743, + "fuo": 1744, + "quasi": 1745, + "ulti": 1746, + "ito": 1747, + "sue": 1748, + "fil": 1749, + "allora": 1750, + "veni": 1751, + "tano": 1752, + "ello": 1753, + "ão": 1754, + "não": 1755, + "uma": 1756, + "ela": 1757, + "lh": 1758, + "ção": 1759, + "cê": 1760, + "inha": 1761, + "você": 1762, + "ec": 1763, + "dade": 1764, + "ao": 1765, + "ram": 1766, + "vel": 1767, + "ém": 1768, + "pode": 1769, + "estava": 1770, + "isso": 1771, + "mui": 1772, + "faz": 1773, + "ões": 1774, + "pes": 1775, + "ix": 1776, + "sim": 1777, + "olh": 1778, + "isa": 1779, + "ên": 1780, + "tinha": 1781, + "meu": 1782, + "são": 1783, + "minha": 1784, + "muito": 1785, + "foi": 1786, + "bem": 1787, + "diz": 1788, + "parec": 1789, + "ço": 1790, + "pesso": 1791, + "pois": 1792, + "mesmo": 1793, + "ções": 1794, + "seus": 1795, + "até": 1796, + "ência": 1797, + "lhe": 1798, + "tiv": 1799, + "mã": 1800, + "só": 1801, + "tão": 1802, + "tudo": 1803, + "então": 1804, + "inda": 1805, + "bal": 1806, + "indo": 1807, + "ndo": 1808, + "já": 1809, + "vam": 1810, + "eito": 
1811, + "depois": 1812, + "mel": 1813, + "lha": 1814, + "ainda": 1815, + "fazer": 1816, + "pou": 1817, + "pergun": 1818, + "deix": 1819, + "tamb": 1820, + "ala": 1821, + "pelo": 1822, + "também": 1823, + "fica": 1824, + "prec": 1825, + "eles": 1826, + "havia": 1827, + "lá": 1828, + "nas": 1829, + "gem": 1830, + "mem": 1831, + "ós": 1832, + "deu": 1833, + "eiro": 1834, + "..": 1835, + "assim": 1836, + "ior": 1837, + "har": 1838, + "aqui": 1839, + "cul": 1840, + "sar": 1841, + "outra": 1842, + "olhos": 1843, + "ima": 1844, + "mim": 1845, + "ago": 1846, + "pessoas": 1847, + "eram": 1848, + "eira": 1849, + "pela": 1850, + "coisa": 1851, + "mão": 1852, + "conh": 1853, + "agora": 1854, + "iam": 1855, + "há": 1856, + "suas": 1857, + "guém": 1858, + "cabe": 1859, + "nem": 1860, + "ível": 1861, + "consegu": 1862, + "trabal": 1863, + "lev": 1864, + "lem": 1865, + "vai": 1866, + "tei": 1867, + "pró": 1868, + "quem": 1869, + "onde": 1870, + "cabeça": 1871, + "nunca": 1872, + "mentos": 1873, + "hum": 1874, + "dele": 1875, + "verdade": 1876, + "tá": 1877, + "hos": 1878, + "algum": 1879, + "dizer": 1880, + "penas": 1881, + "nós": 1882, + "enquanto": 1883, + "outro": 1884, + "lho": 1885, + "melhor": 1886, + "primei": 1887, + "iu": 1888, + "apenas": 1889, + "estou": 1890, + "conte": 1891, + "homem": 1892, + "dois": 1893, + "ças": 1894, + "pouco": 1895, + "senhor": 1896, + "tando": 1897, + "espera": 1898, + "pai": 1899, + "rios": 1900, + "baix": 1901, + "ase": 1902, + "isas": 1903, + "hora": 1904, + "ficar": 1905, + "seja": 1906, + "ân": 1907, + "clar": 1908, + "inc": 1909, + "fos": 1910, + "ouvi": 1911, + "vem": 1912, + "tava": 1913, + "ário": 1914, + "sos": 1915, + "inho": 1916, + "rando": 1917, + "ês": 1918, + "coisas": 1919, + "aconte": 1920, + "lher": 1921, + "anos": 1922, + "talvez": 1923, + "estão": 1924, + "liv": 1925, + "outros": 1926, + "qualquer": 1927, + "gou": 1928, + "lí": 1929, + "tivesse": 1930, + "rado": 1931, + "precisa": 1932, + "mãe": 1933, + "dela": 1934, + 
"entra": 1935, + "maior": 1936, + "noite": 1937, + "tiva": 1938, + "pala": 1939, + "ração": 1940, + "deus": 1941, + "sas": 1942, + "inte": 1943, + "fei": 1944, + "palav": 1945, + "trás": 1946, + "cidade": 1947, + "lugar": 1948, + "vezes": 1949, + "encontra": 1950, + "tru": 1951, + "eci": 1952, + "ın": 1953, + "bir": 1954, + "yor": 1955, + "ek": 1956, + "dı": 1957, + "ey": 1958, + "tı": 1959, + "mı": 1960, + "iz": 1961, + "ır": 1962, + "gö": 1963, + "sı": 1964, + "bil": 1965, + "lı": 1966, + "üz": 1967, + "iç": 1968, + "iy": 1969, + "ım": 1970, + "uz": 1971, + "cak": 1972, + "iş": 1973, + "ını": 1974, + "iyor": 1975, + "baş": 1976, + "dü": 1977, + "değ": 1978, + "kar": 1979, + "ev": 1980, + "öy": 1981, + "bun": 1982, + "yap": 1983, + "sun": 1984, + "gör": 1985, + "yı": 1986, + "ki": 1987, + "ara": 1988, + "alı": 1989, + "onu": 1990, + "çı": 1991, + "şey": 1992, + "sın": 1993, + "kı": 1994, + "kad": 1995, + "ağ": 1996, + "değil": 1997, + "ük": 1998, + "çok": 1999, + "şı": 2000, + "ül": 2001, + "için": 2002, + "eye": 2003, + "oldu": 2004, + "mış": 2005, + "kal": 2006, + "mek": 2007, + "öyle": 2008, + "yordu": 2009, + "yüz": 2010, + "miş": 2011, + "mak": 2012, + "ola": 2013, + "yan": 2014, + "cek": 2015, + "yorum": 2016, + "bak": 2017, + "üm": 2018, + "ları": 2019, + "oğ": 2020, + "kadar": 2021, + "arı": 2022, + "ında": 2023, + "gün": 2024, + "yok": 2025, + "yer": 2026, + "dım": 2027, + "daha": 2028, + "ına": 2029, + "dim": 2030, + "bilir": 2031, + "iki": 2032, + "siz": 2033, + "diğ": 2034, + "bü": 2035, + "düş": 2036, + "üç": 2037, + "unu": 2038, + "aman": 2039, + "fak": 2040, + "ede": 2041, + "sonra": 2042, + "hiç": 2043, + "aki": 2044, + "ğı": 2045, + "bul": 2046, + "maz": 2047, + "anla": 2048, + "bura": 2049, + "geç": 2050, + "maya": 2051, + "konu": 2052, + "din": 2053, + "tek": 2054, + "zaman": 2055, + "eler": 2056, + "öz": 2057, + "dır": 2058, + "gibi": 2059, + "şa": 2060, + "leri": 2061, + "kim": 2062, + "ku": 2063, + "fakat": 2064, + "yar": 2065, + "göz": 2066, 
+ "cı": 2067, + "yorsun": 2068, + "bek": 2069, + "inde": 2070, + "pek": 2071, + "bunu": 2072, + "lik": 2073, + "iler": 2074, + "edi": 2075, + "öl": 2076, + "sür": 2077, + "sır": 2078, + "çık": 2079, + "sıl": 2080, + "alar": 2081, + "kes": 2082, + "yak": 2083, + "çek": 2084, + "yıl": 2085, + "ecek": 2086, + "ız": 2087, + "git": 2088, + "kap": 2089, + "ama": 2090, + "ıl": 2091, + "ların": 2092, + "biz": 2093, + "tır": 2094, + "oy": 2095, + "ancak": 2096, + "doğ": 2097, + "bana": 2098, + "şim": 2099, + "başla": 2100, + "lü": 2101, + "madı": 2102, + "beni": 2103, + "yük": 2104, + "lık": 2105, + "beş": 2106, + "nasıl": 2107, + "tık": 2108, + "tür": 2109, + "daki": 2110, + "ceğ": 2111, + "zı": 2112, + "iyi": 2113, + "dok": 2114, + "benim": 2115, + "cağ": 2116, + "yen": 2117, + "şu": 2118, + "mez": 2119, + "düşün": 2120, + "kendi": 2121, + "şimdi": 2122, + "yol": 2123, + "yu": 2124, + "iste": 2125, + "sek": 2126, + "mam": 2127, + "söyle": 2128, + "dik": 2129, + "kur": 2130, + "olduğ": 2131, + "sını": 2132, + "biliyor": 2133, + "kan": 2134, + "yal": 2135, + "meye": 2136, + "muş": 2137, + "kaç": 2138, + "iye": 2139, + "tü": 2140, + "ef": 2141, + "tım": 2142, + "evet": 2143, + "yet": 2144, + "burada": 2145, + "tim": 2146, + "biraz": 2147, + "kor": 2148, + "doğru": 2149, + "inin": 2150, + "kız": 2151, + "diye": 2152, + "dör": 2153, + "etti": 2154, + "onun": 2155, + "isti": 2156, + "ği": 2157, + "sana": 2158, + "üş": 2159, + "arka": 2160, + "hayır": 2161, + "karşı": 2162, + "ile": 2163, + "hak": 2164, + "ıyor": 2165, + "neden": 2166, + "sev": 2167, + "sız": 2168, + "çocu": 2169, + "çalı": 2170, + "olur": 2171, + "bır": 2172, + "gir": 2173, + "ise": 2174, + "ih": 2175, + "kır": 2176, + "dön": 2177, + "böyle": 2178, + "seni": 2179, + "!\"": 2180, + "dört": 2181, + "söy": 2182, + "oş": 2183, + "musun": 2184, + "laş": 2185, + "ip": 2186, + "kay": 2187, + "hem": 2188, + "büyük": 2189, + "aç": 2190, + "bırak": 2191, + "misin": 2192, + "söz": 2193, + "değiş": 2194, + "ünü": 2195, + 
"gül": 2196, + "kö": 2197, + "karı": 2198, + "tamam": 2199, + "olu": 2200, + "yeni": 2201, + "lam": 2202, + "mıştı": 2203, + "yaş": 2204, + "iniz": 2205, + "kadın": 2206, + "bunun": 2207, + "mey": 2208, + "altı": 2209, + "yi": 2210, + "inden": 2211, + "senin": 2212, + "yat": 2213, + "top": 2214, + "isi": 2215, + "dün": 2216, + "hiçbir": 2217, + "yon": 2218, + "dın": 2219, + "tün": 2220, + "başka": 2221, + "hep": 2222, + "irmi": 2223, + "devam": 2224, + "olacak": 2225, + "artık": 2226, + "durum": 2227, + "imiz": 2228, + "üzel": 2229, + "lerini": 2230, + "sağ": 2231, + "gerek": 2232, + "yirmi": 2233, + "şek": 2234, + "bağ": 2235, + "lara": 2236, + "yür": 2237, + "ması": 2238, + "katı": 2239, + "dedi": 2240, + "gü": 2241, + "sorun": 2242, + "üne": 2243, + "mız": 2244, + "yapı": 2245, + "mil": 2246, + "ğını": 2247, + "tara": 2248, + "vardı": 2249, + "konuş": 2250, + "arak": 2251, + "larak": 2252, + "çocuk": 2253, + "bütün": 2254, + "ley": 2255, + "dür": 2256, + "güzel": 2257, + "ayı": 2258, + "yapa": 2259, + "nı": 2260, + "ayr": 2261, + "öne": 2262, + "yordum": 2263, + "ban": 2264, + "i̇ş": 2265, + "dum": 2266, + "yorlar": 2267, + "larını": 2268, + "çıkar": 2269, + "zan": 2270, + "seç": 2271, + "liyor": 2272, + "tak": 2273, + "şık": 2274, + "tekrar": 2275, + "aş": 2276, + "eş": 2277, + "mişti": 2278, + "kin": 2279, + "imi": 2280, + "eğ": 2281, + "gidi": 2282, + "leş": 2283, + "başladı": 2284, + "gide": 2285, + "otur": 2286, + "dde": 2287, + "ından": 2288, + "üzer": 2289, + "ının": 2290, + "nız": 2291, + "uy": 2292, + "yedi": 2293, + "kat": 2294, + "olarak": 2295, + "ladı": 2296, + "yalnız": 2297, + "bah": 2298, + "iyet": 2299, + "sak": 2300, + "açık": 2301, + "sında": 2302, + "...": 2303, + "insan": 2304, + "aynı": 2305, + "eder": 2306, + "istan": 2307, + "uzun": 2308, + "geri": 2309, + "erek": 2310, + "olan": 2311, + "gerçek": 2312, + "alan": 2313, + "dış": 2314, + "alık": 2315, + "fark": 2316, + "üst": 2317, + "sade": 2318, + "kiş": 2319, + "ldı": 2320, + "zor": 
2321, + "etir": 2322, + "herkes": 2323, + "ömer": 2324, + "unda": 2325, + "haf": 2326, + "buna": 2327, + "ydı": 2328, + "peki": 2329, + "adam": 2330, + "haz": 2331, + "sına": 2332, + "kapı": 2333, + "görüş": 2334, + "sadece": 2335, + "aldı": 2336, + "geldi": 2337, + "rz": 2338, + "sz": 2339, + "cz": 2340, + "ię": 2341, + "dz": 2342, + "ał": 2343, + "się": 2344, + "rze": 2345, + "że": 2346, + "wy": 2347, + "rzy": 2348, + "ła": 2349, + "ło": 2350, + "ny": 2351, + "dzie": 2352, + "dzi": 2353, + "czy": 2354, + "cie": 2355, + "prze": 2356, + "dy": 2357, + "kie": 2358, + "ry": 2359, + "ją": 2360, + "ów": 2361, + "przy": 2362, + "mie": 2363, + "szy": 2364, + "cze": 2365, + "bie": 2366, + "cy": 2367, + "nia": 2368, + "ści": 2369, + "sze": 2370, + "jest": 2371, + "ży": 2372, + "ną": 2373, + "któ": 2374, + "ała": 2375, + "mnie": 2376, + "ły": 2377, + "cza": 2378, + "jak": 2379, + "roz": 2380, + "ró": 2381, + "zna": 2382, + "łu": 2383, + "ść": 2384, + "wia": 2385, + "wszy": 2386, + "spo": 2387, + "gdy": 2388, + "wał": 2389, + "wię": 2390, + "łem": 2391, + "ję": 2392, + "sk": 2393, + "rę": 2394, + "dob": 2395, + "już": 2396, + "bę": 2397, + "ałem": 2398, + "sza": 2399, + "pod": 2400, + "dla": 2401, + "pan": 2402, + "nę": 2403, + "może": 2404, + "śli": 2405, + "ało": 2406, + "lko": 2407, + "nych": 2408, + "powie": 2409, + "cię": 2410, + "tylko": 2411, + "naj": 2412, + "tego": 2413, + "ski": 2414, + "nego": 2415, + "wszyst": 2416, + "szcze": 2417, + "jed": 2418, + "jej": 2419, + "two": 2420, + "ąd": 2421, + "śmy": 2422, + "czę": 2423, + "wać": 2424, + "jego": 2425, + "ża": 2426, + "sy": 2427, + "praw": 2428, + "tym": 2429, + "który": 2430, + "ały": 2431, + "trze": 2432, + "niej": 2433, + "nym": 2434, + "gło": 2435, + "jąc": 2436, + "mówi": 2437, + "ska": 2438, + "nej": 2439, + "słu": 2440, + "wła": 2441, + "będzie": 2442, + "dę": 2443, + "pó": 2444, + "bez": 2445, + "nic": 2446, + "pła": 2447, + "ście": 2448, + "są": 2449, + "trzy": 2450, + "kiem": 2451, + "był": 2452, + "mog": 
2453, + "robi": 2454, + "tam": 2455, + "mię": 2456, + "zy": 2457, + "pew": 2458, + "myś": 2459, + "przed": 2460, + "sko": 2461, + "które": 2462, + "lę": 2463, + "wsze": 2464, + "ąc": 2465, + "było": 2466, + "sobie": 2467, + "py": 2468, + "cią": 2469, + "jeszcze": 2470, + "tę": 2471, + "czas": 2472, + "szę": 2473, + "gł": 2474, + "kę": 2475, + "czu": 2476, + "przez": 2477, + "sło": 2478, + "wz": 2479, + "kto": 2480, + "ków": 2481, + "czo": 2482, + "liśmy": 2483, + "więc": 2484, + "rą": 2485, + "wó": 2486, + "rza": 2487, + "ności": 2488, + "wet": 2489, + "nął": 2490, + "śmie": 2491, + "nawet": 2492, + "musi": 2493, + "swo": 2494, + "tej": 2495, + "wą": 2496, + "wu": 2497, + "wią": 2498, + "niu": 2499, + "czą": 2500, + "dzo": 2501, + "skie": 2502, + "jeśli": 2503, + "czego": 2504, + "chy": 2505, + "dł": 2506, + "tych": 2507, + "bym": 2508, + "żo": 2509, + "eś": 2510, + "sią": 2511, + "kiedy": 2512, + "wró": 2513, + "dze": 2514, + "dro": 2515, + "rów": 2516, + "pani": 2517, + "kul": 2518, + "nad": 2519, + "chwi": 2520, + "nim": 2521, + "być": 2522, + "chodzi": 2523, + "nio": 2524, + "dobrze": 2525, + "teraz": 2526, + "wokul": 2527, + "coś": 2528, + "kł": 2529, + "pier": 2530, + "gdzie": 2531, + "dzy": 2532, + "pię": 2533, + "dź": 2534, + "ką": 2535, + "gó": 2536, + "zda": 2537, + "chce": 2538, + "stę": 2539, + "świa": 2540, + "wszystko": 2541, + "peł": 2542, + "wiem": 2543, + "wiel": 2544, + "każ": 2545, + "rzu": 2546, + "sły": 2547, + "jedna": 2548, + "myśl": 2549, + "mój": 2550, + "jestem": 2551, + "óż": 2552, + "miej": 2553, + "moż": 2554, + "kła": 2555, + "resz": 2556, + "dłu": 2557, + "stwo": 2558, + "nię": 2559, + "masz": 2560, + "żeby": 2561, + "niem": 2562, + "jakie": 2563, + "sty": 2564, + "nią": 2565, + "wej": 2566, + "oj": 2567, + "sła": 2568, + "ność": 2569, + "zło": 2570, + "szczę": 2571, + "lej": 2572, + "wego": 2573, + "cał": 2574, + "dział": 2575, + "kich": 2576, + "dza": 2577, + "dzię": 2578, + "oczy": 2579, + "zosta": 2580, + "czło": 2581, + "nam": 
2582, + "kil": 2583, + "szu": 2584, + "wę": 2585, + "miał": 2586, + "strze": 2587, + "cej": 2588, + "ej": 2589, + "znaj": 2590, + "dać": 2591, + "miejs": 2592, + "kró": 2593, + "kry": 2594, + "bardzo": 2595, + "śnie": 2596, + "lą": 2597, + "gie": 2598, + "ciebie": 2599, + "dni": 2600, + "potrze": 2601, + "wokulski": 2602, + "uwa": 2603, + "umie": 2604, + "jednak": 2605, + "kra": 2606, + "wróci": 2607, + "człowie": 2608, + "czyć": 2609, + "była": 2610, + "żeli": 2611, + "mę": 2612, + "cę": 2613, + "zrobi": 2614, + "mogę": 2615, + "prowa": 2616, + "rem": 2617, + "niech": 2618, + "cznie": 2619, + "kro": 2620, + "tą": 2621, + "chci": 2622, + "bro": 2623, + "dzieć": 2624, + "szą": 2625, + "pad": 2626, + "trz": 2627, + "jem": 2628, + "tów": 2629, + "dru": 2630, + "taj": 2631, + "rzekł": 2632, + "niego": 2633, + "takie": 2634, + "wała": 2635, + "towa": 2636, + "kapła": 2637, + "widzi": 2638, + "podob": 2639, + "dzę": 2640, + "tał": 2641, + "stęp": 2642, + "bą": 2643, + "poko": 2644, + "wem": 2645, + "gę": 2646, + "aby": 2647, + "albo": 2648, + "spra": 2649, + "zno": 2650, + "smo": 2651, + "jesz": 2652, + "księ": 2653, + "jesteś": 2654, + "poz": 2655, + "nigdy": 2656, + "ksią": 2657, + "cóż": 2658, + "ws": 2659, + "pow": 2660, + "tka": 2661, + "świe": 2662, + "szka": 2663, + "samo": 2664, + "sł": 2665, + "rzę": 2666, + "nale": 2667, + "chcesz": 2668, + "nik": 2669, + "pę": 2670, + "chyba": 2671, + "ciąg": 2672, + "jący": 2673, + "woj": 2674, + "nasze": 2675, + "mniej": 2676, + "więcej": 2677, + "zwy": 2678, + "osta": 2679, + "waż": 2680, + "śmier": 2681, + "wier": 2682, + "dzą": 2683, + "zaś": 2684, + "gdyby": 2685, + "jaki": 2686, + "wol": 2687, + "win": 2688, + "dą": 2689, + "ścia": 2690, + "rozma": 2691, + "wal": 2692, + "panie": 2693, + "star": 2694, + "kaz": 2695, + "jeżeli": 2696, + "wra": 2697, + "koń": 2698, + "siebie": 2699, + "znowu": 2700, + "czem": 2701, + "stwa": 2702, + "isto": 2703, + "pół": 2704, + "dał": 2705, + "kobie": 2706, + "ałam": 2707, + "wych": 
2708, + "cesa": 2709, + "nich": 2710, + "zawsze": 2711, + "dzić": 2712, + "też": 2713, + "lepie": 2714, + "proszę": 2715, + "kre": 2716, + "twa": 2717, + "łą": 2718, + "chu": 2719, + "cą": 2720, + "prz": 2721, + "łe": 2722, + "szedł": 2723, + "odpowie": 2724, + "myśli": 2725, + "świą": 2726, + "ź": 2727, + "ł": 2728, + "&": 2729, + "=": 2730, + "ă": 2731, + "đ": 2732, + "ţ": 2733, + "–": 2734, + "‘": 2735, + "ij": 2736, + "aa": 2737, + "een": 2738, + "het": 2739, + "aar": 2740, + "oor": 2741, + "ijn": 2742, + "dat": 2743, + "oe": 2744, + "ijk": 2745, + "aan": 2746, + "voor": 2747, + "iet": 2748, + "zijn": 2749, + "niet": 2750, + "oo": 2751, + "moet": 2752, + "heb": 2753, + "uit": 2754, + "wij": 2755, + "aat": 2756, + "lijk": 2757, + "sl": 2758, + "daar": 2759, + "deze": 2760, + "worden": 2761, + "moeten": 2762, + "onder": 2763, + "hebben": 2764, + "ook": 2765, + "ct": 2766, + "nog": 2767, + "aal": 2768, + "eer": 2769, + "bij": 2770, + "mijn": 2771, + "kom": 2772, + "atie": 2773, + "eft": 2774, + "kel": 2775, + "rij": 2776, + "heid": 2777, + "af": 2778, + "stel": 2779, + "maar": 2780, + "wee": 2781, + "heeft": 2782, + "waar": 2783, + "eren": 2784, + "wat": 2785, + "wil": 2786, + "aag": 2787, + "bet": 2788, + "hij": 2789, + "kun": 2790, + "uw": 2791, + "dt": 2792, + "door": 2793, + "tij": 2794, + "ond": 2795, + "geen": 2796, + "gev": 2797, + "veel": 2798, + "naar": 2799, + "aten": 2800, + "kunnen": 2801, + "echt": 2802, + "goe": 2803, + "twee": 2804, + "delijk": 2805, + "uur": 2806, + "toe": 2807, + "meer": 2808, + "onze": 2809, + "tijd": 2810, + "hoe": 2811, + "tot": 2812, + "zou": 2813, + "aak": 2814, + "amen": 2815, + "woor": 2816, + "wordt": 2817, + "gelijk": 2818, + "gaan": 2819, + "ker": 2820, + "eld": 2821, + "hou": 2822, + "zel": 2823, + "tegen": 2824, + "komen": 2825, + "werk": 2826, + "goed": 2827, + "zal": 2828, + "zij": 2829, + "slag": 2830, + "zien": 2831, + "echter": 2832, + "itie": 2833, + "tie": 2834, + "elijk": 2835, + "ische": 2836, + "belan": 2837, 
+ "haar": 2838, + "vr": 2839, + "grijk": 2840, + "doen": 2841, + "land": 2842, + "belangrijk": 2843, + "open": 2844, + "ctie": 2845, + "zelf": 2846, + "mij": 2847, + "iteit": 2848, + "stem": 2849, + "mee": 2850, + "aren": 2851, + "dien": 2852, + "gaat": 2853, + "prob": 2854, + "moe": 2855, + "ullen": 2856, + "zich": 2857, + "daarom": 2858, + "orm": 2859, + "staat": 2860, + "zit": 2861, + "dui": 2862, + "dus": 2863, + "ds": 2864, + "verslag": 2865, + "kelijk": 2866, + "proble": 2867, + "schap": 2868, + "gd": 2869, + "hun": 2870, + "erd": 2871, + "zet": 2872, + "staan": 2873, + "maal": 2874, + "inder": 2875, + "eid": 2876, + "kken": 2877, + "ged": 2878, + "zullen": 2879, + "mensen": 2880, + "jaar": 2881, + "regel": 2882, + "ieder": 2883, + "volgen": 2884, + "geven": 2885, + "even": 2886, + "blij": 2887, + "ië": 2888, + "uwe": 2889, + "maken": 2890, + "oek": 2891, + "nieuwe": 2892, + "baar": 2893, + "andere": 2894, + "ruik": 2895, + "agen": 2896, + "ouw": 2897, + "willen": 2898, + "aakt": 2899, + "hoo": 2900, + "anden": 2901, + "lig": 2902, + "samen": 2903, + "zeer": 2904, + "duidelijk": 2905, + "antwoor": 2906, + "heel": 2907, + "punt": 2908, + "houden": 2909, + "vraag": 2910, + "gele": 2911, + "eens": 2912, + "besch": 2913, + "omen": 2914, + "erg": 2915, + "doel": 2916, + "dag": 2917, + "uren": 2918, + "ings": 2919, + "oren": 2920, + "delen": 2921, + "steun": 2922, + "innen": 2923, + "pol": 2924, + "oon": 2925, + "sn": 2926, + "zonder": 2927, + "nodig": 2928, + "alleen": 2929, + "mid": 2930, + "ragen": 2931, + "iets": 2932, + "versch": 2933, + "gebruik": 2934, + "rouw": 2935, + "stellen": 2936, + "menten": 2937, + "eerste": 2938, + "laat": 2939, + "groot": 2940, + "ood": 2941, + "toch": 2942, + "laten": 2943, + "aard": 2944, + "sle": 2945, + "deel": 2946, + "plaat": 2947, + "ree": 2948, + "betre": 2949, + "lid": 2950, + "uiten": 2951, + "racht": 2952, + "beleid": 2953, + "stie": 2954, + "staten": 2955, + "ggen": 2956, + "reken": 2957, + "alen": 2958, + "ming": 2959, 
+ "mogelijk": 2960, + "grote": 2961, + "altijd": 2962, + "enkel": 2963, + "wik": 2964, + "politie": 2965, + "elk": 2966, + "handel": 2967, + "kwe": 2968, + "maat": 2969, + "elen": 2970, + "vrij": 2971, + "jes": 2972, + "aam": 2973, + "huis": 2974, + "weer": 2975, + "lidstaten": 2976, + "king": 2977, + "kle": 2978, + "bed": 2979, + "geval": 2980, + "wikkel": 2981, + "kwestie": 2982, + "stee": 2983, + "hel": 2984, + "komst": 2985, + "iden": 2986, + "eerd": 2987, + "tweede": 2988, + "probleem": 2989, + "ussen": 2990, + "snel": 2991, + "tig": 2992, + "ult": 2993, + "nemen": 2994, + "commis": 2995, + "verschil": 2996, + "zoek": 2997, + "krij": 2998, + "graag": 2999, + "denk": 3000, + "landen": 3001, + "reden": 3002, + "besl": 3003, + "oeg": 3004, + "beter": 3005, + "heden": 3006, + "mag": 3007, + "boven": 3008, + "cont": 3009, + "fd": 3010, + "hele": 3011, + "vier": 3012, + "gez": 3013, + "kw": 3014, + "aas": 3015, + "ontwikkel": 3016, + "drie": 3017, + "vaak": 3018, + "plaats": 3019, + "gang": 3020, + "ijf": 3021, + "natuur": 3022, + "tussen": 3023, + "bat": 3024, + "komt": 3025, + "wacht": 3026, + "aad": 3027, + "achter": 3028, + "gebie": 3029, + "verk": 3030, + "ligt": 3031, + "nieuw": 3032, + "vand": 3033, + "ý": 3034, + "ď": 3035, + "ě": 3036, + "ř": 3037, + "ť": 3038, + "ů": 3039, + "„": 3040, + "ní": 3041, + "ně": 3042, + "ře": 3043, + "ná": 3044, + "vě": 3045, + "vá": 3046, + "rá": 3047, + "vy": 3048, + "mě": 3049, + "ři": 3050, + "ří": 3051, + "že": 3052, + "jí": 3053, + "vý": 3054, + "ji": 3055, + "dě": 3056, + "če": 3057, + "tě": 3058, + "ky": 3059, + "še": 3060, + "ké": 3061, + "ší": 3062, + "pře": 3063, + "ví": 3064, + "ný": 3065, + "ži": 3066, + "má": 3067, + "cí": 3068, + "zá": 3069, + "ské": 3070, + "dá": 3071, + "byl": 3072, + "tí": 3073, + "pří": 3074, + "při": 3075, + "či": 3076, + "vní": 3077, + "ča": 3078, + "dí": 3079, + "dní": 3080, + "ká": 3081, + "nou": 3082, + "vět": 3083, + "pě": 3084, + "kou": 3085, + "ých": 3086, + "bě": 3087, + "prá": 3088, 
+ "jako": 3089, + "ží": 3090, + "zí": 3091, + "jsou": 3092, + "jsem": 3093, + "lní": 3094, + "cké": 3095, + "vat": 3096, + "před": 3097, + "hla": 3098, + "stá": 3099, + "čí": 3100, + "ši": 3101, + "kla": 3102, + "ště": 3103, + "lou": 3104, + "mů": 3105, + "chá": 3106, + "pů": 3107, + "také": 3108, + "dů": 3109, + "nost": 3110, + "tře": 3111, + "sku": 3112, + "vše": 3113, + "tní": 3114, + "byla": 3115, + "ční": 3116, + "jeho": 3117, + "bý": 3118, + "vání": 3119, + "ných": 3120, + "tři": 3121, + "vz": 3122, + "stře": 3123, + "dva": 3124, + "hle": 3125, + "čá": 3126, + "nosti": 3127, + "vš": 3128, + "hra": 3129, + "jen": 3130, + "slo": 3131, + "však": 3132, + "kdy": 3133, + "bylo": 3134, + "bude": 3135, + "jší": 3136, + "vých": 3137, + "ním": 3138, + "sm": 3139, + "koli": 3140, + "rů": 3141, + "může": 3142, + "není": 3143, + "hod": 3144, + "bí": 3145, + "tý": 3146, + "stě": 3147, + "uje": 3148, + "sá": 3149, + "pět": 3150, + "krá": 3151, + "tom": 3152, + "ství": 3153, + "vně": 3154, + "sed": 3155, + "své": 3156, + "pí": 3157, + "musí": 3158, + "už": 3159, + "tím": 3160, + "jící": 3161, + "jedno": 3162, + "čas": 3163, + "čty": 3164, + "ský": 3165, + "evro": 3166, + "toho": 3167, + "hy": 3168, + "kter": 3169, + "rní": 3170, + "stí": 3171, + "svě": 3172, + "pak": 3173, + "všech": 3174, + "ků": 3175, + "ng": 3176, + "ád": 3177, + "chází": 3178, + "být": 3179, + "první": 3180, + "mno": 3181, + "ského": 3182, + "pá": 3183, + "nebo": 3184, + "kem": 3185, + "sla": 3186, + "ného": 3187, + "zde": 3188, + "další": 3189, + "řa": 3190, + "čtyři": 3191, + "hrá": 3192, + "druh": 3193, + "lně": 3194, + "vla": 3195, + "ských": 3196, + "ško": 3197, + "půso": 3198, + "proto": 3199, + "vů": 3200, + "ská": 3201, + "šest": 3202, + "dně": 3203, + "ještě": 3204, + "mezi": 3205, + "několi": 3206, + "již": 3207, + "čně": 3208, + "slu": 3209, + "zná": 3210, + "sedm": 3211, + "vlá": 3212, + "osm": 3213, + "byly": 3214, + "vám": 3215, + "cký": 3216, + "tech": 3217, + "ději": 3218, + "velmi": 
3219, + "leži": 3220, + "vala": 3221, + "lý": 3222, + "tvo": 3223, + "spole": 3224, + "stup": 3225, + "mož": 3226, + "evrop": 3227, + "stal": 3228, + "jde": 3229, + "rodi": 3230, + "její": 3231, + "poli": 3232, + "devět": 3233, + "sme": 3234, + "až": 3235, + "této": 3236, + "tento": 3237, + "kaž": 3238, + "nula": 3239, + "bych": 3240, + "moc": 3241, + "stou": 3242, + "kdo": 3243, + "zd": 3244, + "praco": 3245, + "tomu": 3246, + "ným": 3247, + "živo": 3248, + "zem": 3249, + "násle": 3250, + "sky": 3251, + "jich": 3252, + "měl": 3253, + "děla": 3254, + "jsme": 3255, + "nice": 3256, + "stej": 3257, + "stní": 3258, + "náro": 3259, + "nit": 3260, + "později": 3261, + "tako": 3262, + "nce": 3263, + "čer": 3264, + "ším": 3265, + "něco": 3266, + "vál": 3267, + "řej": 3268, + "krát": 3269, + "ální": 3270, + "asi": 3271, + "které": 3272, + "stav": 3273, + "mají": 3274, + "mys": 3275, + "době": 3276, + "sně": 3277, + "zku": 3278, + "tů": 3279, + "chod": 3280, + "spě": 3281, + "jejich": 3282, + "součas": 3283, + "vali": 3284, + "kte": 3285, + "prů": 3286, + "zení": 3287, + "pat": 3288, + "potře": 3289, + "dnes": 3290, + "zemí": 3291, + "znam": 3292, + "mám": 3293, + "tedy": 3294, + "hlavní": 3295, + "použí": 3296, + "bní": 3297, + "vede": 3298, + "lep": 3299, + "jek": 3300, + "prav": 3301, + "politi": 3302, + "dne": 3303, + "čení": 3304, + "než": 3305, + "děl": 3306, + "čo": 3307, + "cích": 3308, + "sté": 3309, + "dlou": 3310, + "několik": 3311, + "vyu": 3312, + "ckých": 3313, + "nové": 3314, + "čin": 3315, + "dělá": 3316, + "ký": 3317, + "obla": 3318, + "podle": 3319, + "důleži": 3320, + "poku": 3321, + "kone": 3322, + "dý": 3323, + "dvě": 3324, + "žád": 3325, + "nout": 3326, + "tku": 3327, + "tvr": 3328, + "ckého": 3329, + "rov": 3330, + "tele": 3331, + "psa": 3332, + "svět": 3333, + "tivní": 3334, + "dosta": 3335, + "šel": 3336, + "druhé": 3337, + "skou": 3338, + "žo": 3339, + "jedná": 3340, + "význam": 3341, + "problé": 3342, + "publi": 3343, + "ván": 3344, + "odpo": 3345, 
+ "podpo": 3346, + "dle": 3347, + "jaké": 3348, + "šení": 3349, + "vím": 3350, + "během": 3351, + "nachází": 3352, + "slou": 3353, + "pouze": 3354, + "otá": 3355, + "plo": 3356, + "tové": 3357, + "větši": 3358, + "komi": 3359, + "vají": 3360, + "tyto": 3361, + "zápa": 3362, + "změ": 3363, + "moh": 3364, + "více": 3365, + "společ": 3366, + "auto": 3367, + "proti": 3368, + "dět": 3369, + "cháze": 3370, + "žel": 3371, + "«": 3372, + "»": 3373, + "а": 3374, + "б": 3375, + "в": 3376, + "г": 3377, + "д": 3378, + "е": 3379, + "ж": 3380, + "з": 3381, + "и": 3382, + "й": 3383, + "к": 3384, + "л": 3385, + "м": 3386, + "н": 3387, + "о": 3388, + "п": 3389, + "р": 3390, + "с": 3391, + "т": 3392, + "у": 3393, + "ф": 3394, + "х": 3395, + "ц": 3396, + "ч": 3397, + "ш": 3398, + "щ": 3399, + "ъ": 3400, + "ы": 3401, + "ь": 3402, + "э": 3403, + "ю": 3404, + "я": 3405, + "ё": 3406, + "‑": 3407, + "−": 3408, + "ст": 3409, + "ен": 3410, + "но": 3411, + "на": 3412, + "пр": 3413, + "то": 3414, + "по": 3415, + "ра": 3416, + "го": 3417, + "ко": 3418, + "не": 3419, + "во": 3420, + "ва": 3421, + "ет": 3422, + "ер": 3423, + "ни": 3424, + "ел": 3425, + "ит": 3426, + "ны": 3427, + "за": 3428, + "ро": 3429, + "ени": 3430, + "ка": 3431, + "ли": 3432, + "ем": 3433, + "да": 3434, + "об": 3435, + "ла": 3436, + "до": 3437, + "ся": 3438, + "ть": 3439, + "от": 3440, + "ло": 3441, + "ль": 3442, + "ед": 3443, + "со": 3444, + "ми": 3445, + "ре": 3446, + "мо": 3447, + "ци": 3448, + "про": 3449, + "та": 3450, + "это": 3451, + "ки": 3452, + "ру": 3453, + "при": 3454, + "ти": 3455, + "се": 3456, + "ста": 3457, + "вы": 3458, + "мы": 3459, + "ви": 3460, + "бы": 3461, + "ма": 3462, + "ес": 3463, + "ля": 3464, + "сти": 3465, + "ле": 3466, + "что": 3467, + "ме": 3468, + "ри": 3469, + "ча": 3470, + "од": 3471, + "ей": 3472, + "ель": 3473, + "ения": 3474, + "га": 3475, + "ну": 3476, + "си": 3477, + "па": 3478, + "раз": 3479, + "бо": 3480, + "сто": 3481, + "су": 3482, + "са": 3483, + "ду": 3484, + "его": 3485, + "ест": 
3486, + "ин": 3487, + "ить": 3488, + "из": 3489, + "же": 3490, + "му": 3491, + "пер": 3492, + "под": 3493, + "ение": 3494, + "сь": 3495, + "ку": 3496, + "пред": 3497, + "ного": 3498, + "ных": 3499, + "вер": 3500, + "те": 3501, + "ной": 3502, + "ции": 3503, + "де": 3504, + "ры": 3505, + "дел": 3506, + "лю": 3507, + "ве": 3508, + "он": 3509, + "мен": 3510, + "ги": 3511, + "ня": 3512, + "бу": 3513, + "пра": 3514, + "все": 3515, + "ется": 3516, + "сть": 3517, + "жа": 3518, + "дол": 3519, + "жи": 3520, + "бе": 3521, + "кон": 3522, + "сл": 3523, + "ши": 3524, + "ди": 3525, + "ств": 3526, + "ско": 3527, + "ные": 3528, + "чи": 3529, + "ют": 3530, + "дер": 3531, + "стра": 3532, + "ты": 3533, + "ход": 3534, + "щи": 3535, + "зо": 3536, + "зна": 3537, + "ности": 3538, + "чес": 3539, + "вля": 3540, + "вать": 3541, + "ор": 3542, + "пол": 3543, + "вет": 3544, + "так": 3545, + "ша": 3546, + "ту": 3547, + "сво": 3548, + "пре": 3549, + "она": 3550, + "итель": 3551, + "ный": 3552, + "сло": 3553, + "как": 3554, + "вл": 3555, + "ность": 3556, + "хо": 3557, + "мож": 3558, + "пе": 3559, + "для": 3560, + "ния": 3561, + "ное": 3562, + "рас": 3563, + "долж": 3564, + "дар": 3565, + "тель": 3566, + "ска": 3567, + "пу": 3568, + "ство": 3569, + "кото": 3570, + "раб": 3571, + "ее": 3572, + "род": 3573, + "эти": 3574, + "соб": 3575, + "ору": 3576, + "жен": 3577, + "ным": 3578, + "ити": 3579, + "ние": 3580, + "ком": 3581, + "дет": 3582, + "сту": 3583, + "гу": 3584, + "пи": 3585, + "меж": 3586, + "ению": 3587, + "тер": 3588, + "работ": 3589, + "воз": 3590, + "ция": 3591, + "кой": 3592, + "щест": 3593, + "гра": 3594, + "зи": 3595, + "ря": 3596, + "между": 3597, + "ства": 3598, + "вс": 3599, + "ело": 3600, + "ше": 3601, + "мер": 3602, + "ба": 3603, + "зы": 3604, + "лу": 3605, + "аль": 3606, + "дей": 3607, + "гла": 3608, + "народ": 3609, + "кти": 3610, + "предста": 3611, + "лся": 3612, + "явля": 3613, + "ски": 3614, + "нов": 3615, + "един": 3616, + "ров": 3617, + "ис": 3618, + "нима": 3619, + "рем": 
3620, + "ходи": 3621, + "также": 3622, + "дру": 3623, + "ать": 3624, + "след": 3625, + "гово": 3626, + "ная": 3627, + "ющи": 3628, + "ень": 3629, + "которы": 3630, + "хот": 3631, + "ву": 3632, + "их": 3633, + "ему": 3634, + "чит": 3635, + "важ": 3636, + "орга": 3637, + "чески": 3638, + "ще": 3639, + "ке": 3640, + "ха": 3641, + "пос": 3642, + "том": 3643, + "боль": 3644, + "мне": 3645, + "пас": 3646, + "объ": 3647, + "прав": 3648, + "конф": 3649, + "слу": 3650, + "поддер": 3651, + "стви": 3652, + "наш": 3653, + "лько": 3654, + "стоя": 3655, + "ную": 3656, + "лем": 3657, + "енных": 3658, + "кра": 3659, + "ды": 3660, + "международ": 3661, + "гда": 3662, + "необ": 3663, + "госу": 3664, + "ству": 3665, + "ении": 3666, + "государ": 3667, + "кто": 3668, + "им": 3669, + "чест": 3670, + "рет": 3671, + "вопро": 3672, + "лен": 3673, + "ели": 3674, + "рова": 3675, + "ций": 3676, + "нам": 3677, + "этой": 3678, + "жения": 3679, + "необходи": 3680, + "меня": 3681, + "было": 3682, + "сили": 3683, + "фи": 3684, + "вя": 3685, + "шь": 3686, + "этого": 3687, + "они": 3688, + "органи": 3689, + "безо": 3690, + "проб": 3691, + "име": 3692, + "реш": 3693, + "би": 3694, + "безопас": 3695, + "ются": 3696, + "оста": 3697, + "енно": 3698, + "год": 3699, + "ела": 3700, + "представ": 3701, + "ться": 3702, + "слово": 3703, + "организа": 3704, + "должны": 3705, + "этом": 3706, + "бла": 3707, + "че": 3708, + "чу": 3709, + "благо": 3710, + "этому": 3711, + "врем": 3712, + "спе": 3713, + "ном": 3714, + "ений": 3715, + "спо": 3716, + "нас": 3717, + "нет": 3718, + "зу": 3719, + "вед": 3720, + "еще": 3721, + "сказа": 3722, + "сей": 3723, + "ерен": 3724, + "дан": 3725, + "сам": 3726, + "еля": 3727, + "ран": 3728, + "зыва": 3729, + "является": 3730, + "будет": 3731, + "ктив": 3732, + "тре": 3733, + "деле": 3734, + "мот": 3735, + "конферен": 3736, + "лась": 3737, + "час": 3738, + "сторо": 3739, + "кого": 3740, + "ез": 3741, + "ней": 3742, + "ос": 3743, + "лись": 3744, + "разору": 3745, + "пере": 3746, + 
"сси": 3747, + "ными": 3748, + "проц": 3749, + "голо": 3750, + "чело": 3751, + "боле": 3752, + "челове": 3753, + "сер": 3754, + "пл": 3755, + "чет": 3756, + "стран": 3757, + "пя": 3758, + "был": 3759, + "кла": 3760, + "тов": 3761, + "жд": 3762, + "дела": 3763, + "ера": 3764, + "уже": 3765, + "совет": 3766, + "ген": 3767, + "безопасности": 3768, + "ца": 3769, + "седа": 3770, + "поз": 3771, + "ответ": 3772, + "проблем": 3773, + "нако": 3774, + "тем": 3775, + "доста": 3776, + "пы": 3777, + "ща": 3778, + "вой": 3779, + "сущест": 3780, + "необходимо": 3781, + "быть": 3782, + "может": 3783, + "дем": 3784, + "чтобы": 3785, + "ек": 3786, + "чер": 3787, + "усили": 3788, + "рес": 3789, + "руд": 3790, + "единенных": 3791, + "доб": 3792, + "дости": 3793, + "ствен": 3794, + "ядер": 3795, + "годня": 3796, + "каза": 3797, + "сегодня": 3798, + "сейчас": 3799, + "только": 3800, + "вод": 3801, + "есь": 3802, + "много": 3803, + "буду": 3804, + "ев": 3805, + "есть": 3806, + "три": 3807, + "общест": 3808, + "явл": 3809, + "высту": 3810, + "ред": 3811, + "счит": 3812, + "сит": 3813, + "делега": 3814, + "лож": 3815, + "этот": 3816, + "фор": 3817, + "клю": 3818, + "возмож": 3819, + "вания": 3820, + "бли": 3821, + "или": 3822, + "вз": 3823, + "наций": 3824, + "ского": 3825, + "приня": 3826, + "пла": 3827, + "оч": 3828, + "иться": 3829, + "сте": 3830, + "наши": 3831, + "которые": 3832, + "ар": 3833, + "имеет": 3834, + "сот": 3835, + "знач": 3836, + "перь": 3837, + "следу": 3838, + "ены": 3839, + "таки": 3840, + "объединенных": 3841, + "стро": 3842, + "теперь": 3843, + "бле": 3844, + "благодар": 3845, + "разв": 3846, + "ан": 3847, + "жива": 3848, + "очень": 3849, + "ят": 3850, + "без": 3851, + "обес": 3852, + "гро": 3853, + "лось": 3854, + "сы": 3855, + "организации": 3856, + "член": 3857, + "того": 3858, + "ональ": 3859, + "жда": 3860, + "всех": 3861, + "свя": 3862, + "более": 3863, + "сов": 3864, + "когда": 3865, + "вот": 3866, + "кре": 3867, + "кры": 3868, + "поэтому": 3869, + "воль": 
3870, + "ой": 3871, + "генера": 3872, + "чем": 3873, + "лы": 3874, + "полити": 3875, + "вен": 3876, + "конференции": 3877, + "процес": 3878, + "бя": 3879, + "ите": 3880, + "отно": 3881, + "развити": 3882, + "аф": 3883, + "ющ": 3884, + "вно": 3885, + "мир": 3886, + "нии": 3887, + "кая": 3888, + "ас": 3889, + "ительно": 3890, + "вто": 3891, + "ением": 3892, + "генераль": 3893, + "прот": 3894, + "всем": 3895, + "самбле": 3896, + "ассамбле": 3897, + "ом": 3898, + "зд": 3899, + "смот": 3900, + "реги": 3901, + "чего": 3902, + "однако": 3903, + "усилия": 3904, + "действи": 3905, + "чно": 3906, + "уча": 3907, + "образ": 3908, + "вос": 3909, + "эта": 3910, + "перего": 3911, + "говор": 3912, + "вам": 3913, + "моло": 3914, + "время": 3915, + "дь": 3916, + "хотел": 3917, + "гру": 3918, + "заявл": 3919, + "предоста": 3920, + "поль": 3921, + "нее": 3922, + "резо": 3923, + "перегово": 3924, + "резолю": 3925, + "крет": 3926, + "поддерж": 3927, + "обеспе": 3928, + "него": 3929, + "представит": 3930, + "наде": 3931, + "кри": 3932, + "чь": 3933, + "проек": 3934, + "лет": 3935, + "други": 3936, + "_": 3937, + "،": 3938, + "؛": 3939, + "؟": 3940, + "ء": 3941, + "آ": 3942, + "أ": 3943, + "ؤ": 3944, + "إ": 3945, + "ئ": 3946, + "ا": 3947, + "ب": 3948, + "ة": 3949, + "ت": 3950, + "ث": 3951, + "ج": 3952, + "ح": 3953, + "خ": 3954, + "د": 3955, + "ذ": 3956, + "ر": 3957, + "ز": 3958, + "س": 3959, + "ش": 3960, + "ص": 3961, + "ض": 3962, + "ط": 3963, + "ظ": 3964, + "ع": 3965, + "غ": 3966, + "ـ": 3967, + "ف": 3968, + "ق": 3969, + "ك": 3970, + "ل": 3971, + "م": 3972, + "ن": 3973, + "ه": 3974, + "و": 3975, + "ى": 3976, + "ي": 3977, + "ً": 3978, + "ٌ": 3979, + "ٍ": 3980, + "َ": 3981, + "ُ": 3982, + "ِ": 3983, + "ّ": 3984, + "ْ": 3985, + "ٰ": 3986, + "چ": 3987, + "ڨ": 3988, + "ک": 3989, + "ھ": 3990, + "ی": 3991, + "ۖ": 3992, + "ۗ": 3993, + "ۘ": 3994, + "ۚ": 3995, + "ۛ": 3996, + "—": 3997, + "☭": 3998, + "ﺃ": 3999, + "ﻻ": 4000, + "ال": 4001, + "َا": 4002, + "وَ": 4003, + "َّ": 4004, + "ِي": 4005, + 
"أَ": 4006, + "لَ": 4007, + "نَ": 4008, + "الْ": 4009, + "هُ": 4010, + "ُو": 4011, + "ما": 4012, + "نْ": 4013, + "من": 4014, + "عَ": 4015, + "نا": 4016, + "لا": 4017, + "مَ": 4018, + "تَ": 4019, + "فَ": 4020, + "أن": 4021, + "لي": 4022, + "مِ": 4023, + "ان": 4024, + "في": 4025, + "رَ": 4026, + "يَ": 4027, + "هِ": 4028, + "مْ": 4029, + "قَ": 4030, + "بِ": 4031, + "لى": 4032, + "ين": 4033, + "إِ": 4034, + "لِ": 4035, + "وا": 4036, + "كَ": 4037, + "ها": 4038, + "ًا": 4039, + "مُ": 4040, + "ون": 4041, + "الم": 4042, + "بَ": 4043, + "يا": 4044, + "ذا": 4045, + "سا": 4046, + "الل": 4047, + "مي": 4048, + "يْ": 4049, + "را": 4050, + "ري": 4051, + "لك": 4052, + "مَا": 4053, + "نَّ": 4054, + "لم": 4055, + "إن": 4056, + "ست": 4057, + "وم": 4058, + "َّا": 4059, + "لَا": 4060, + "هم": 4061, + "ِّ": 4062, + "كُ": 4063, + "كان": 4064, + "سَ": 4065, + "با": 4066, + "دي": 4067, + "حَ": 4068, + "عْ": 4069, + "بي": 4070, + "الأ": 4071, + "ول": 4072, + "فِي": 4073, + "رِ": 4074, + "دا": 4075, + "مِنْ": 4076, + "ُونَ": 4077, + "وْ": 4078, + "هَا": 4079, + "ُّ": 4080, + "الس": 4081, + "الَ": 4082, + "ني": 4083, + "لْ": 4084, + "تُ": 4085, + "هل": 4086, + "رة": 4087, + "دَ": 4088, + "سْ": 4089, + "تِ": 4090, + "نَا": 4091, + "رْ": 4092, + "اللَّ": 4093, + "سامي": 4094, + "كن": 4095, + "كل": 4096, + "هَ": 4097, + "عَلَ": 4098, + "على": 4099, + "مع": 4100, + "إلى": 4101, + "قد": 4102, + "الر": 4103, + "ُوا": 4104, + "ير": 4105, + "عن": 4106, + "يُ": 4107, + "نِ": 4108, + "بْ": 4109, + "الح": 4110, + "هُمْ": 4111, + "قا": 4112, + "ذه": 4113, + "الت": 4114, + "ِينَ": 4115, + "جَ": 4116, + "هذا": 4117, + "عد": 4118, + "الع": 4119, + "دْ": 4120, + "قَالَ": 4121, + "رُ": 4122, + "يم": 4123, + "ية": 4124, + "نُ": 4125, + "خَ": 4126, + "رب": 4127, + "الك": 4128, + "وَا": 4129, + "أنا": 4130, + "ةِ": 4131, + "الن": 4132, + "حد": 4133, + "عِ": 4134, + "تا": 4135, + "هو": 4136, + "فا": 4137, + "عا": 4138, + "الش": 4139, + "لُ": 4140, + "يت": 4141, + "ذَا": 4142, + "يع": 4143, + "الذ": 4144, + "حْ": 
4145, + "الص": 4146, + "إِنَّ": 4147, + "جا": 4148, + "علي": 4149, + "كَا": 4150, + "بُ": 4151, + "تع": 4152, + "وق": 4153, + "مل": 4154, + "لَّ": 4155, + "يد": 4156, + "أخ": 4157, + "رف": 4158, + "تي": 4159, + "الِ": 4160, + "ّا": 4161, + "ذلك": 4162, + "أَنْ": 4163, + "سِ": 4164, + "توم": 4165, + "مر": 4166, + "مَنْ": 4167, + "بل": 4168, + "الق": 4169, + "الله": 4170, + "ِيَ": 4171, + "كم": 4172, + "ذَ": 4173, + "عل": 4174, + "حب": 4175, + "سي": 4176, + "عُ": 4177, + "الج": 4178, + "الد": 4179, + "شَ": 4180, + "تك": 4181, + "فْ": 4182, + "صَ": 4183, + "لل": 4184, + "دِ": 4185, + "بر": 4186, + "فِ": 4187, + "ته": 4188, + "أع": 4189, + "تْ": 4190, + "قْ": 4191, + "الْأَ": 4192, + "ئِ": 4193, + "عَنْ": 4194, + "ور": 4195, + "حا": 4196, + "الَّ": 4197, + "مت": 4198, + "فر": 4199, + "دُ": 4200, + "هنا": 4201, + "وَأَ": 4202, + "تب": 4203, + "ةُ": 4204, + "أي": 4205, + "سب": 4206, + "ريد": 4207, + "وج": 4208, + "كُمْ": 4209, + "حِ": 4210, + "كْ": 4211, + "در": 4212, + "َاء": 4213, + "هذه": 4214, + "الط": 4215, + "الْمُ": 4216, + "دة": 4217, + "قل": 4218, + "غَ": 4219, + "يوم": 4220, + "الَّذ": 4221, + "كر": 4222, + "تر": 4223, + "كِ": 4224, + "كي": 4225, + "عَلَى": 4226, + "رَب": 4227, + "عة": 4228, + "قُ": 4229, + "جْ": 4230, + "فض": 4231, + "لة": 4232, + "هْ": 4233, + "رَا": 4234, + "وَلَ": 4235, + "الْمَ": 4236, + "أَنَّ": 4237, + "يَا": 4238, + "أُ": 4239, + "شي": 4240, + "اللَّهُ": 4241, + "لَى": 4242, + "قِ": 4243, + "أت": 4244, + "عَلَيْ": 4245, + "اللَّهِ": 4246, + "الب": 4247, + "ضَ": 4248, + "ةً": 4249, + "قي": 4250, + "ار": 4251, + "بد": 4252, + "خْ": 4253, + "سْتَ": 4254, + "طَ": 4255, + "قَدْ": 4256, + "ذهب": 4257, + "أم": 4258, + "ماذا": 4259, + "وَإِ": 4260, + "ةٌ": 4261, + "ونَ": 4262, + "ليلى": 4263, + "ولا": 4264, + "حُ": 4265, + "هي": 4266, + "صل": 4267, + "الخ": 4268, + "ود": 4269, + "ليس": 4270, + "لدي": 4271, + "قال": 4272, + "كَانَ": 4273, + "مَّ": 4274, + "حي": 4275, + "تم": 4276, + "لن": 4277, + "وَلَا": 4278, + "بع": 4279, + "يمكن": 4280, + 
"سُ": 4281, + "ةَ": 4282, + "حت": 4283, + "رًا": 4284, + "كا": 4285, + "شا": 4286, + "هِمْ": 4287, + "لَهُ": 4288, + "زَ": 4289, + "داً": 4290, + "مس": 4291, + "كث": 4292, + "الْعَ": 4293, + "جِ": 4294, + "صْ": 4295, + "فَا": 4296, + "له": 4297, + "وي": 4298, + "عَا": 4299, + "هُوَ": 4300, + "بِي": 4301, + "بَا": 4302, + "أس": 4303, + "ثَ": 4304, + "لِي": 4305, + "رض": 4306, + "الرَّ": 4307, + "لِكَ": 4308, + "تَّ": 4309, + "فُ": 4310, + "قة": 4311, + "فعل": 4312, + "مِن": 4313, + "الآ": 4314, + "ثُ": 4315, + "سم": 4316, + "مَّا": 4317, + "بِهِ": 4318, + "تق": 4319, + "خر": 4320, + "لقد": 4321, + "خل": 4322, + "شر": 4323, + "أنت": 4324, + "لَّا": 4325, + "سن": 4326, + "السَّ": 4327, + "الذي": 4328, + "سَا": 4329, + "وما": 4330, + "زل": 4331, + "وب": 4332, + "أْ": 4333, + "إذا": 4334, + "رِي": 4335, + "حة": 4336, + "نِي": 4337, + "الْحَ": 4338, + "وَقَالَ": 4339, + "به": 4340, + "ةٍ": 4341, + "سأ": 4342, + "رٌ": 4343, + "بال": 4344, + "مة": 4345, + "شْ": 4346, + "وت": 4347, + "عند": 4348, + "فس": 4349, + "بَعْ": 4350, + "هر": 4351, + "قط": 4352, + "أح": 4353, + "إنه": 4354, + "وع": 4355, + "فت": 4356, + "غا": 4357, + "هناك": 4358, + "بت": 4359, + "مِنَ": 4360, + "سر": 4361, + "ذَلِكَ": 4362, + "رس": 4363, + "حدث": 4364, + "غْ": 4365, + "ِّي": 4366, + "الإ": 4367, + "وَيَ": 4368, + "جل": 4369, + "است": 4370, + "قِي": 4371, + "عب": 4372, + "وس": 4373, + "يش": 4374, + "الَّذِينَ": 4375, + "تاب": 4376, + "دِي": 4377, + "جب": 4378, + "كون": 4379, + "بن": 4380, + "الث": 4381, + "لَيْ": 4382, + "بعد": 4383, + "وَالْ": 4384, + "فَأَ": 4385, + "عم": 4386, + "هُم": 4387, + "تن": 4388, + "ذْ": 4389, + "أص": 4390, + "أين": 4391, + "رَبِّ": 4392, + "الذين": 4393, + "إِن": 4394, + "بين": 4395, + "جُ": 4396, + "عَلَيْهِ": 4397, + "حَا": 4398, + "لو": 4399, + "ستط": 4400, + "ظر": 4401, + "لَمْ": 4402, + "ءِ": 4403, + "كُل": 4404, + "طل": 4405, + "تَا": 4406, + "ضُ": 4407, + "كنت": 4408, + "لًا": 4409, + "مٌ": 4410, + "قبل": 4411, + "ــ": 4412, + "ذِ": 4413, + "قَوْ": 4414, + "صِ": 
4415, + "مًا": 4416, + "كانت": 4417, + "صا": 4418, + "يق": 4419, + "الف": 4420, + "النا": 4421, + "مٍ": 4422, + "إِنْ": 4423, + "النَّ": 4424, + "جد": 4425, + "وَمَا": 4426, + "تت": 4427, + "بح": 4428, + "مكان": 4429, + "كيف": 4430, + "ّة": 4431, + "الا": 4432, + "جَا": 4433, + "أو": 4434, + "ساعد": 4435, + "ضِ": 4436, + "إلا": 4437, + "راً": 4438, + "قَا": 4439, + "رأ": 4440, + "عت": 4441, + "أحد": 4442, + "هد": 4443, + "ضا": 4444, + "طر": 4445, + "أق": 4446, + "ماء": 4447, + "دَّ": 4448, + "البا": 4449, + "مُو": 4450, + "أَوْ": 4451, + "طا": 4452, + "قُو": 4453, + "خِ": 4454, + "تل": 4455, + "ستطيع": 4456, + "دَا": 4457, + "النَّا": 4458, + "إلَى": 4459, + "وَتَ": 4460, + "هَذَا": 4461, + "بة": 4462, + "عليك": 4463, + "جر": 4464, + "المن": 4465, + "زا": 4466, + "رٍ": 4467, + "دع": 4468, + "ًّا": 4469, + "سة": 4470, + "ثُمَّ": 4471, + "شيء": 4472, + "الغ": 4473, + "تح": 4474, + "رُونَ": 4475, + "اليوم": 4476, + "مِي": 4477, + "نُوا": 4478, + "أر": 4479, + "تُمْ": 4480, + "عر": 4481, + "يف": 4482, + "أب": 4483, + "دًا": 4484, + "صَا": 4485, + "التَّ": 4486, + "أريد": 4487, + "الز": 4488, + "يَوْ": 4489, + "إلي": 4490, + "جي": 4491, + "يَعْ": 4492, + "فضل": 4493, + "الإن": 4494, + "أنه": 4495, + "1": 4496, + "2": 4497, + "3": 4498, + "4": 4499, + "5": 4500, + "·": 4501, + "×": 4502, + "̃": 4503, + "̌": 4504, + "ε": 4505, + "λ": 4506, + "μ": 4507, + "•": 4508, + "‧": 4509, + "─": 4510, + "□": 4511, + "、": 4512, + "。": 4513, + "〈": 4514, + "〉": 4515, + "《": 4516, + "》": 4517, + "「": 4518, + "」": 4519, + "『": 4520, + "』": 4521, + "ア": 4522, + "オ": 4523, + "カ": 4524, + "チ": 4525, + "ド": 4526, + "ベ": 4527, + "ャ": 4528, + "ヤ": 4529, + "ン": 4530, + "・": 4531, + "ー": 4532, + "ㄟ": 4533, + "!": 4534, + "(": 4535, + ")": 4536, + ",": 4537, + "-": 4538, + "/": 4539, + ":": 4540, + ";": 4541, + "?": 4542, + "p": 4543, + "i4": 4544, + "zh": 4545, + "i2": 4546, + "ng1": 4547, + "u4": 4548, + "i1": 4549, + "ng2": 4550, + "u3": 4551, + "de5": 4552, + "e4": 4553, + "i3": 4554, + 
"ng4": 4555, + "an4": 4556, + "shi4": 4557, + "an2": 4558, + "u2": 4559, + "u1": 4560, + "ng3": 4561, + "a1": 4562, + "an1": 4563, + "e2": 4564, + "a4": 4565, + "ei4": 4566, + "ong1": 4567, + "ai4": 4568, + "ao4": 4569, + "ang1": 4570, + "an3": 4571, + "wei4": 4572, + "uo2": 4573, + "n1": 4574, + "en2": 4575, + "ao3": 4576, + "e1": 4577, + "qi": 4578, + "eng2": 4579, + "zho": 4580, + "ang3": 4581, + "ang4": 4582, + "ang2": 4583, + "uo4": 4584, + "ge4": 4585, + "yi1": 4586, + "guo2": 4587, + "a3": 4588, + "he2": 4589, + "e3": 4590, + "yi2": 4591, + "di4": 4592, + "zhong1": 4593, + "bu4": 4594, + "ai2": 4595, + "n2": 4596, + "zai4": 4597, + "shi2": 4598, + "eng1": 4599, + "ren2": 4600, + "ong2": 4601, + "xian4": 4602, + "xu": 4603, + "n4": 4604, + "li4": 4605, + "en4": 4606, + "yu2": 4607, + "ei2": 4608, + "yi2ge4": 4609, + "ou4": 4610, + "ei3": 4611, + "ui4": 4612, + "a2": 4613, + "you3": 4614, + "ao1": 4615, + "da4": 4616, + "cheng2": 4617, + "en1": 4618, + "eng4": 4619, + "yi4": 4620, + "si1": 4621, + "zhi4": 4622, + "jia1": 4623, + "yuan2": 4624, + "ta1": 4625, + "de5yi2ge4": 4626, + "ke1": 4627, + "shu3": 4628, + "xi1": 4629, + "ji2": 4630, + "ao2": 4631, + "ou3": 4632, + "ong4": 4633, + "xia4": 4634, + "ai1": 4635, + "gong1": 4636, + "zhi1": 4637, + "en3": 4638, + "wei2": 4639, + "xue2": 4640, + "qu1": 4641, + "zhou1": 4642, + "er3": 4643, + "ming2": 4644, + "zhong3": 4645, + "li3": 4646, + "wu4": 4647, + "yi3": 4648, + "uo1": 4649, + "e5": 4650, + "ji4": 4651, + "xing2": 4652, + "jian4": 4653, + "hua4": 4654, + "yu3": 4655, + "uo3": 4656, + "ji1": 4657, + "ai3": 4658, + "zuo4": 4659, + "hou4": 4660, + "hui4": 4661, + "ei1": 4662, + "nian2": 4663, + "qi2": 4664, + "dao4": 4665, + "sheng1": 4666, + "de2": 4667, + "dai4": 4668, + "uan2": 4669, + "zhe4": 4670, + "zheng4": 4671, + "ben3": 4672, + "shang4": 4673, + "zhu3": 4674, + "bei4": 4675, + "ye4": 4676, + "chu1": 4677, + "zhan4": 4678, + "le5": 4679, + "lai2": 4680, + "shi3": 4681, + "nan2": 4682, + "ren4": 
4683, + "you2": 4684, + "ke4": 4685, + "ba1": 4686, + "fu4": 4687, + "dui4": 4688, + "ya4": 4689, + "mei3": 4690, + "zi4": 4691, + "xin1": 4692, + "jing1": 4693, + "zhu": 4694, + "n3": 4695, + "yong4": 4696, + "mu4": 4697, + "jiao4": 4698, + "ye3": 4699, + "jin4": 4700, + "bian4": 4701, + "lu4": 4702, + "qi1": 4703, + "she4": 4704, + "xiang1": 4705, + "ong3": 4706, + "shu4": 4707, + "dong4": 4708, + "suo3": 4709, + "guan1": 4710, + "san1": 4711, + "te4": 4712, + "duo1": 4713, + "fu2": 4714, + "min2": 4715, + "la1": 4716, + "zhi2": 4717, + "zhen4": 4718, + "ou1": 4719, + "wu3": 4720, + "ma3": 4721, + "i5": 4722, + "zi5": 4723, + "ju4": 4724, + "er4": 4725, + "yao4": 4726, + "xia4de5yi2ge4": 4727, + "si4": 4728, + "tu2": 4729, + "shan1": 4730, + "zui4": 4731, + "yin1": 4732, + "er2": 4733, + "tong2": 4734, + "dong1": 4735, + "yu4": 4736, + "yan2": 4737, + "qian2": 4738, + "shu3xia4de5yi2ge4": 4739, + "jun1": 4740, + "ke3": 4741, + "wen2": 4742, + "fa3": 4743, + "luo2": 4744, + "zhu4": 4745, + "xi4": 4746, + "kou3": 4747, + "bei3": 4748, + "jian1": 4749, + "fa1": 4750, + "dian4": 4751, + "jiang1": 4752, + "wei4yu2": 4753, + "xiang4": 4754, + "zhi3": 4755, + "eng3": 4756, + "fang1": 4757, + "lan2": 4758, + "shu": 4759, + "ri4": 4760, + "lian2": 4761, + "shou3": 4762, + "qiu2": 4763, + "jin1": 4764, + "huo4": 4765, + "shu3xia4de5yi2ge4zhong3": 4766, + "fen1": 4767, + "nei4": 4768, + "gai1": 4769, + "mei3guo2": 4770, + "un2": 4771, + "ge2": 4772, + "bao3": 4773, + "qing1": 4774, + "gao1": 4775, + "tai2": 4776, + "xiao3": 4777, + "jie2": 4778, + "tian1": 4779, + "chang2": 4780, + "quan2": 4781, + "lie4": 4782, + "hai3": 4783, + "fei1": 4784, + "ti3": 4785, + "jue2": 4786, + "ou2": 4787, + "ci3": 4788, + "zu2": 4789, + "ni2": 4790, + "biao3": 4791, + "zhong1guo2": 4792, + "du4": 4793, + "yue4": 4794, + "xing4": 4795, + "sheng4": 4796, + "che1": 4797, + "dan1": 4798, + "jie1": 4799, + "lin2": 4800, + "ping2": 4801, + "fu3": 4802, + "gu3": 4803, + "jie4": 4804, + "v3": 4805, 
+ "sheng3": 4806, + "na4": 4807, + "yuan4": 4808, + "zhang3": 4809, + "guan3": 4810, + "dao3": 4811, + "zu3": 4812, + "ding4": 4813, + "dian3": 4814, + "ceng2": 4815, + "ren2kou3": 4816, + "tai4": 4817, + "tong1": 4818, + "guo4": 4819, + "neng2": 4820, + "chang3": 4821, + "hua2": 4822, + "liu2": 4823, + "ying1": 4824, + "xiao4": 4825, + "ci4": 4826, + "bian4hua4": 4827, + "liang3": 4828, + "gong4": 4829, + "zhong4": 4830, + "de5yi1": 4831, + "se4": 4832, + "kai1": 4833, + "wang2": 4834, + "jiu4": 4835, + "shi1": 4836, + "shou4": 4837, + "mei2": 4838, + "feng1": 4839, + "ze2": 4840, + "tu2shi4": 4841, + "ti2": 4842, + "qi4": 4843, + "jiu3": 4844, + "shen1": 4845, + "zhe3": 4846, + "ren2kou3bian4hua4": 4847, + "ren2kou3bian4hua4tu2shi4": 4848, + "di4qu1": 4849, + "yang2": 4850, + "men5": 4851, + "long2": 4852, + "bing4": 4853, + "chan3": 4854, + "zhu1": 4855, + "wei3": 4856, + "wai4": 4857, + "xing1": 4858, + "bo1": 4859, + "bi3": 4860, + "tang2": 4861, + "hua1": 4862, + "bo2": 4863, + "shui3": 4864, + "shu1": 4865, + "dou1": 4866, + "sai4": 4867, + "chao2": 4868, + "bi4": 4869, + "ling2": 4870, + "lei4": 4871, + "da4xue2": 4872, + "fen4": 4873, + "shu3de5": 4874, + "mu3": 4875, + "jiao1": 4876, + "dang1": 4877, + "cheng1": 4878, + "tong3": 4879, + "nv3": 4880, + "qi3": 4881, + "yan3": 4882, + "mian4": 4883, + "luo4": 4884, + "jing4": 4885, + "ge1": 4886, + "ru4": 4887, + "dan4": 4888, + "ri4ben3": 4889, + "pu3": 4890, + "yun4": 4891, + "huang2": 4892, + "wo3": 4893, + "lv": 4894, + "hai2": 4895, + "shi4yi1": 4896, + "xie1": 4897, + "ying3": 4898, + "wu2": 4899, + "shen2": 4900, + "wang3": 4901, + "guang3": 4902, + "liu4": 4903, + "su4": 4904, + "shi4zhen4": 4905, + "can1": 4906, + "cao3": 4907, + "xia2": 4908, + "ka3": 4909, + "da2": 4910, + "hu4": 4911, + "ban4": 4912, + "dang3": 4913, + "hu2": 4914, + "zong3": 4915, + "deng3": 4916, + "de5yi2ge4shi4zhen4": 4917, + "chuan2": 4918, + "mo4": 4919, + "zhang1": 4920, + "ban1": 4921, + "mo2": 4922, + "cha2": 4923, + 
"ce4": 4924, + "zhu3yao4": 4925, + "tou2": 4926, + "ju2": 4927, + "shi4wei4yu2": 4928, + "sa4": 4929, + "un1": 4930, + "ke3yi3": 4931, + "du1": 4932, + "han4": 4933, + "liang4": 4934, + "sha1": 4935, + "jia3": 4936, + "zi1": 4937, + "lv4": 4938, + "fu1": 4939, + "xian1": 4940, + "xu4": 4941, + "guang1": 4942, + "meng2": 4943, + "bao4": 4944, + "you4": 4945, + "rong2": 4946, + "zhi1yi1": 4947, + "wei1": 4948, + "mao2": 4949, + "guo2jia1": 4950, + "cong2": 4951, + "gou4": 4952, + "tie3": 4953, + "zhen1": 4954, + "du2": 4955, + "bian1": 4956, + "ci2": 4957, + "qu3": 4958, + "fan4": 4959, + "xiang3": 4960, + "men2": 4961, + "ju1": 4962, + "hong2": 4963, + "zi3": 4964, + "ta1men5": 4965, + "ji3": 4966, + "zong1": 4967, + "zhou1de5yi2ge4shi4zhen4": 4968, + "tuan2": 4969, + "jing3": 4970, + "gong1si1": 4971, + "xie4": 4972, + "li2": 4973, + "li4shi3": 4974, + "bao1": 4975, + "gang3": 4976, + "gui1": 4977, + "zheng1": 4978, + "zhi2wu4": 4979, + "ta1de5": 4980, + "pin3": 4981, + "zhuan1": 4982, + "chong2": 4983, + "shi3yong4": 4984, + "wa3": 4985, + "shuo1": 4986, + "chuan1": 4987, + "lei2": 4988, + "wan1": 4989, + "huo2": 4990, + "su1": 4991, + "zao3": 4992, + "gai3": 4993, + "qu4": 4994, + "gu4": 4995, + "xi2": 4996, + "hang2": 4997, + "ying4": 4998, + "cun1": 4999, + "gen1": 5000, + "ying2": 5001, + "ting2": 5002, + "cheng2shi4": 5003, + "jiang3": 5004, + "ling3": 5005, + "lun2": 5006, + "bu4fen4": 5007, + "deng1": 5008, + "xuan3": 5009, + "dong4wu4": 5010, + "de2guo2": 5011, + "xian3": 5012, + "fan3": 5013, + "zhe5": 5014, + "han2": 5015, + "hao4": 5016, + "mi4": 5017, + "ran2": 5018, + "qin1": 5019, + "tiao2": 5020, + "zhan3": 5021, + "[ar]": 5022, + "[zh-cn]": 5023, + "¡": 5024, + "é": 5025, + "shi": 5026, + "tsu": 5027, + "teki": 5028, + "nai": 5029, + "aru": 5030, + "uu": 5031, + "kai": 5032, + "shite": 5033, + "mono": 5034, + "koto": 5035, + "kara": 5036, + "shita": 5037, + "suru": 5038, + "masu": 5039, + "tai": 5040, + "ware": 5041, + "shin": 5042, + "oku": 5043, 
+ "yuu": 5044, + "iru": 5045, + "jiko": 5046, + "desu": 5047, + "rare": 5048, + "shou": 5049, + "sha": 5050, + "sekai": 5051, + "kyou": 5052, + "mashita": 5053, + "nara": 5054, + "kei": 5055, + "ita": 5056, + "ari": 5057, + "itsu": 5058, + "kono": 5059, + "naka": 5060, + "chou": 5061, + "sore": 5062, + "naru": 5063, + "gaku": 5064, + "reba": 5065, + "hito": 5066, + "sai": 5067, + "nan": 5068, + "dai": 5069, + "tsuku": 5070, + "shiki": 5071, + "sare": 5072, + "naku": 5073, + "jun": 5074, + "kaku": 5075, + "zai": 5076, + "wata": 5077, + "shuu": 5078, + "ii": 5079, + "kare": 5080, + "shii": 5081, + "made": 5082, + "sho": 5083, + "kereba": 5084, + "shika": 5085, + "ichi": 5086, + "deki": 5087, + "nin": 5088, + "wareware": 5089, + "nakereba": 5090, + "oite": 5091, + "yaku": 5092, + "mujun": 5093, + "yoku": 5094, + "butsu": 5095, + "omo": 5096, + "gae": 5097, + "naranai": 5098, + "tachi": 5099, + "chuu": 5100, + "kangae": 5101, + "toki": 5102, + "koro": 5103, + "mujunteki": 5104, + "naga": 5105, + "jin": 5106, + "shima": 5107, + "iku": 5108, + "imasu": 5109, + "hon": 5110, + "kae": 5111, + "kore": 5112, + "kita": 5113, + "datta": 5114, + "jitsu": 5115, + "mae": 5116, + "toku": 5117, + "douitsu": 5118, + "ritsu": 5119, + "kyuu": 5120, + "hyou": 5121, + "rareta": 5122, + "keisei": 5123, + "kkan": 5124, + "rareru": 5125, + "mou": 5126, + "doko": 5127, + "ryou": 5128, + "dake": 5129, + "nakatta": 5130, + "soko": 5131, + "tabe": 5132, + "hana": 5133, + "fuku": 5134, + "yasu": 5135, + "wataku": 5136, + "yama": 5137, + "kyo": 5138, + "genzai": 5139, + "boku": 5140, + "ata": 5141, + "kawa": 5142, + "masen": 5143, + "juu": 5144, + "natte": 5145, + "watakushi": 5146, + "yotte": 5147, + "hai": 5148, + "jishin": 5149, + "rete": 5150, + "oka": 5151, + "kagaku": 5152, + "natta": 5153, + "karu": 5154, + "nari": 5155, + "mata": 5156, + "kuru": 5157, + "gai": 5158, + "kari": 5159, + "shakai": 5160, + "koui": 5161, + "yori": 5162, + "setsu": 5163, + "reru": 5164, + "tokoro": 5165, + 
"jutsu": 5166, + "saku": 5167, + "ttai": 5168, + "ningen": 5169, + "tame": 5170, + "kankyou": 5171, + "ooku": 5172, + "watashi": 5173, + "tsukuru": 5174, + "sugi": 5175, + "jibun": 5176, + "shitsu": 5177, + "keru": 5178, + "kishi": 5179, + "shikashi": 5180, + "moto": 5181, + "mari": 5182, + "itte": 5183, + "deshita": 5184, + "nde": 5185, + "arimasu": 5186, + "koe": 5187, + "zettai": 5188, + "kkanteki": 5189, + "rekishi": 5190, + "dekiru": 5191, + "tsuka": 5192, + "itta": 5193, + "kobutsu": 5194, + "miru": 5195, + "shoku": 5196, + "shimasu": 5197, + "gijutsu": 5198, + "gyou": 5199, + "joushiki": 5200, + "atta": 5201, + "hodo": 5202, + "koko": 5203, + "tsukurareta": 5204, + "zoku": 5205, + "hitei": 5206, + "koku": 5207, + "rekishiteki": 5208, + "kete": 5209, + "kako": 5210, + "nagara": 5211, + "kakaru": 5212, + "shutai": 5213, + "haji": 5214, + "taku": 5215, + "douitsuteki": 5216, + "mete": 5217, + "tsuu": 5218, + "sarete": 5219, + "genjitsu": 5220, + "bai": 5221, + "nawa": 5222, + "jikan": 5223, + "waru": 5224, + "rt": 5225, + "atsu": 5226, + "soku": 5227, + "kouiteki": 5228, + "kata": 5229, + "tetsu": 5230, + "gawa": 5231, + "kedo": 5232, + "reta": 5233, + "sayou": 5234, + "tteru": 5235, + "tori": 5236, + "kimi": 5237, + "mura": 5238, + "sareru": 5239, + "machi": 5240, + "kya": 5241, + "osa": 5242, + "konna": 5243, + "aku": 5244, + "sareta": 5245, + "ipp": 5246, + "shiku": 5247, + "uchi": 5248, + "hitotsu": 5249, + "hatara": 5250, + "tachiba": 5251, + "shiro": 5252, + "katachi": 5253, + "tomo": 5254, + "ete": 5255, + "meru": 5256, + "nichi": 5257, + "dare": 5258, + "katta": 5259, + "eru": 5260, + "suki": 5261, + "ooki": 5262, + "maru": 5263, + "moku": 5264, + "oko": 5265, + "kangaerareru": 5266, + "oto": 5267, + "tanni": 5268, + "tada": 5269, + "taiteki": 5270, + "motte": 5271, + "kinou": 5272, + "shinai": 5273, + "kki": 5274, + "tari": 5275, + "ranai": 5276, + "kkou": 5277, + "mirai": 5278, + "ppon": 5279, + "goto": 5280, + "hitsu": 5281, + "teru": 5282, + 
"mochi": 5283, + "katsu": 5284, + "nyuu": 5285, + "zuka": 5286, + "tsuite": 5287, + "nomi": 5288, + "sugu": 5289, + "kuda": 5290, + "tetsugaku": 5291, + "ika": 5292, + "ronri": 5293, + "oki": 5294, + "nippon": 5295, + "shimashita": 5296, + "chishiki": 5297, + "chokkanteki": 5298, + "suko": 5299, + "kuu": 5300, + "arou": 5301, + "katte": 5302, + "kuri": 5303, + "inai": 5304, + "hyougen": 5305, + "ishiki": 5306, + "doku": 5307, + "atte": 5308, + "atara": 5309, + "wari": 5310, + "kao": 5311, + "seisan": 5312, + "hanashi": 5313, + "kake": 5314, + "naji": 5315, + "sunawa": 5316, + "sunawachi": 5317, + "ugo": 5318, + "suu": 5319, + "bara": 5320, + "hiro": 5321, + "iwa": 5322, + "betsu": 5323, + "yoi": 5324, + "seru": 5325, + "shiteru": 5326, + "rarete": 5327, + "toshi": 5328, + "seki": 5329, + "tairitsu": 5330, + "wakara": 5331, + "tokyo": 5332, + "kka": 5333, + "kyoku": 5334, + "iro": 5335, + "mite": 5336, + "saki": 5337, + "kanji": 5338, + "mita": 5339, + "sube": 5340, + "ryoku": 5341, + "matta": 5342, + "kudasai": 5343, + "omoi": 5344, + "wareru": 5345, + "hitsuyou": 5346, + "kashi": 5347, + "renai": 5348, + "kankei": 5349, + "gatte": 5350, + "ochi": 5351, + "motsu": 5352, + "sonzai": 5353, + "taishite": 5354, + "ame": 5355, + "seimei": 5356, + "kano": 5357, + "giri": 5358, + "kangaeru": 5359, + "yue": 5360, + "asa": 5361, + "onaji": 5362, + "yoru": 5363, + "niku": 5364, + "osaka": 5365, + "sukoshi": 5366, + "tama": 5367, + "kanojo": 5368, + "kite": 5369, + "mondai": 5370, + "amari": 5371, + "eki": 5372, + "kojin": 5373, + "haya": 5374, + "dete": 5375, + "atarashii": 5376, + "awa": 5377, + "gakkou": 5378, + "tsuzu": 5379, + "shukan": 5380, + "imashita": 5381, + "atae": 5382, + "darou": 5383, + "hataraku": 5384, + "gata": 5385, + "dachi": 5386, + "matsu": 5387, + "arimasen": 5388, + "seibutsu": 5389, + "mitsu": 5390, + "heya": 5391, + "yasui": 5392, + "deni": 5393, + "noko": 5394, + "haha": 5395, + "domo": 5396, + "kami": 5397, + "sudeni": 5398, + "nao": 5399, + 
"raku": 5400, + "ike": 5401, + "meta": 5402, + "kodomo": 5403, + "soshite": 5404, + "game": 5405, + "bakari": 5406, + "tote": 5407, + "hatsu": 5408, + "mise": 5409, + "mokuteki": 5410, + "dakara": 5411, + "[ja]": 5412, + "ő": 5413, + "ű": 5414, + "そ": 5415, + "な": 5416, + "ん": 5417, + "포": 5418, + "�": 5419, + "gy": 5420, + "eg": 5421, + "cs": 5422, + "ál": 5423, + "egy": 5424, + "át": 5425, + "ott": 5426, + "ett": 5427, + "meg": 5428, + "hogy": 5429, + "ég": 5430, + "ól": 5431, + "nek": 5432, + "volt": 5433, + "ág": 5434, + "nk": 5435, + "ék": 5436, + "ít": 5437, + "ák": 5438, + "ud": 5439, + "szer": 5440, + "mind": 5441, + "oz": 5442, + "ép": 5443, + "ért": 5444, + "mond": 5445, + "szt": 5446, + "nak": 5447, + "ől": 5448, + "csak": 5449, + "oly": 5450, + "áll": 5451, + "ány": 5452, + "mint": 5453, + "már": 5454, + "ött": 5455, + "nagy": 5456, + "ész": 5457, + "azt": 5458, + "elő": 5459, + "tud": 5460, + "ény": 5461, + "áz": 5462, + "még": 5463, + "köz": 5464, + "ely": 5465, + "ség": 5466, + "hoz": 5467, + "uk": 5468, + "kez": 5469, + "ám": 5470, + "aj": 5471, + "unk": 5472, + "vagy": 5473, + "szem": 5474, + "ember": 5475, + "fog": 5476, + "mert": 5477, + "ös": 5478, + "ság": 5479, + "leg": 5480, + "ünk": 5481, + "hát": 5482, + "ony": 5483, + "ezt": 5484, + "minden": 5485, + "ült": 5486, + "jó": 5487, + "kis": 5488, + "áj": 5489, + "úgy": 5490, + "most": 5491, + "ír": 5492, + "itt": 5493, + "elt": 5494, + "mondta": 5495, + "kell": 5496, + "ált": 5497, + "érd": 5498, + "tö": 5499, + "vár": 5500, + "lát": 5501, + "ők": 5502, + "vet": 5503, + "után": 5504, + "két": 5505, + "nap": 5506, + "ív": 5507, + "ály": 5508, + "vég": 5509, + "ök": 5510, + "dul": 5511, + "néz": 5512, + "ában": 5513, + "kül": 5514, + "akkor": 5515, + "szél": 5516, + "új": 5517, + "olyan": 5518, + "ked": 5519, + "hely": 5520, + "tör": 5521, + "ból": 5522, + "elm": 5523, + "ára": 5524, + "ló": 5525, + "volna": 5526, + "lehet": 5527, + "ebb": 5528, + "sok": 5529, + "olt": 5530, + "eket": 5531, + 
"bor": 5532, + "fej": 5533, + "gond": 5534, + "akar": 5535, + "fél": 5536, + "úl": 5537, + "otta": 5538, + "valami": 5539, + "jel": 5540, + "éd": 5541, + "arc": 5542, + "hall": 5543, + "föl": 5544, + "ába": 5545, + "olg": 5546, + "kir": 5547, + "old": 5548, + "kérd": 5549, + "jár": 5550, + "úr": 5551, + "zs": 5552, + "élet": 5553, + "ját": 5554, + "ov": 5555, + "éz": 5556, + "vil": 5557, + "őr": 5558, + "ög": 5559, + "lesz": 5560, + "koz": 5561, + "ább": 5562, + "király": 5563, + "eng": 5564, + "igaz": 5565, + "haj": 5566, + "kod": 5567, + "ról": 5568, + "több": 5569, + "szó": 5570, + "ében": 5571, + "öt": 5572, + "nyi": 5573, + "szól": 5574, + "gondol": 5575, + "egész": 5576, + "így": 5577, + "ős": 5578, + "obb": 5579, + "osan": 5580, + "ből": 5581, + "abb": 5582, + "őt": 5583, + "nál": 5584, + "kép": 5585, + "aztán": 5586, + "tart": 5587, + "beszél": 5588, + "előtt": 5589, + "aszt": 5590, + "maj": 5591, + "kör": 5592, + "hang": 5593, + "íz": 5594, + "incs": 5595, + "év": 5596, + "ód": 5597, + "ók": 5598, + "hozz": 5599, + "okat": 5600, + "nagyon": 5601, + "ház": 5602, + "ped": 5603, + "ezte": 5604, + "etlen": 5605, + "neki": 5606, + "majd": 5607, + "szony": 5608, + "ának": 5609, + "felé": 5610, + "egyszer": 5611, + "adt": 5612, + "gyer": 5613, + "amikor": 5614, + "foly": 5615, + "szak": 5616, + "őd": 5617, + "hú": 5618, + "ász": 5619, + "amely": 5620, + "ére": 5621, + "ilyen": 5622, + "oda": 5623, + "ják": 5624, + "tár": 5625, + "ával": 5626, + "lak": 5627, + "gyan": 5628, + "ély": 5629, + "út": 5630, + "kezd": 5631, + "mell": 5632, + "mikor": 5633, + "hez": 5634, + "való": 5635, + "szeret": 5636, + "rend": 5637, + "vissza": 5638, + "fő": 5639, + "asszony": 5640, + "ről": 5641, + "pedig": 5642, + "szép": 5643, + "ták": 5644, + "öv": 5645, + "világ": 5646, + "maga": 5647, + "szik": 5648, + "éj": 5649, + "ént": 5650, + "jött": 5651, + "szí": 5652, + "gat": 5653, + "ettem": 5654, + "hány": 5655, + "ást": 5656, + "ahol": 5657, + "őket": 5658, + "hár": 5659, + "nő": 
5660, + "csi": 5661, + "talál": 5662, + "elte": 5663, + "látt": 5664, + "tört": 5665, + "hagy": 5666, + "esz": 5667, + "nél": 5668, + "kut": 5669, + "lány": 5670, + "amit": 5671, + "ső": 5672, + "ellen": 5673, + "magát": 5674, + "ugyan": 5675, + "külön": 5676, + "asz": 5677, + "mindig": 5678, + "lép": 5679, + "talán": 5680, + "szor": 5681, + "illan": 5682, + "nincs": 5683, + "vagyok": 5684, + "telen": 5685, + "ismer": 5686, + "isten": 5687, + "ított": 5688, + "jobb": 5689, + "ves": 5690, + "dult": 5691, + "juk": 5692, + "szen": 5693, + "öm": 5694, + "lett": 5695, + "egyik": 5696, + "bár": 5697, + "szi": 5698, + "szív": 5699, + "azon": 5700, + "eszt": 5701, + "föld": 5702, + "kuty": 5703, + "pillan": 5704, + "fér": 5705, + "től": 5706, + "tű": 5707, + "ébe": 5708, + "tött": 5709, + "barát": 5710, + "íg": 5711, + "ahogy": 5712, + "eh": 5713, + "ep": 5714, + "jelent": 5715, + "tat": 5716, + "szeg": 5717, + "mintha": 5718, + "egyen": 5719, + "szab": 5720, + "bizony": 5721, + "jon": 5722, + "öreg": 5723, + "dolg": 5724, + "csap": 5725, + "tiszt": 5726, + "állt": 5727, + "ancs": 5728, + "idő": 5729, + "ügy": 5730, + "miért": 5731, + "ót": 5732, + "csin": 5733, + "ének": 5734, + "vér": 5735, + "jól": 5736, + "alatt": 5737, + "mely": 5738, + "semmi": 5739, + "nyug": 5740, + "vág": 5741, + "követ": 5742, + "össze": 5743, + "mad": 5744, + "acs": 5745, + "fiú": 5746, + "másik": 5747, + "jön": 5748, + "szám": 5749, + "rész": 5750, + "kér": 5751, + "ével": 5752, + "[hu]": 5753, + "%": 5754, + "0": 5755, + "6": 5756, + "7": 5757, + "8": 5758, + "9": 5759, + "A": 5760, + "B": 5761, + "C": 5762, + "D": 5763, + "E": 5764, + "F": 5765, + "G": 5766, + "H": 5767, + "I": 5768, + "J": 5769, + "K": 5770, + "L": 5771, + "M": 5772, + "N": 5773, + "O": 5774, + "P": 5775, + "Q": 5776, + "R": 5777, + "S": 5778, + "T": 5779, + "U": 5780, + "V": 5781, + "W": 5782, + "X": 5783, + "Y": 5784, + "Z": 5785, + "Ł": 5786, + "α": 5787, + "ς": 5788, + "♥": 5789, + "か": 5790, + "ズ": 5791, + "因": 5792, + 
"国": 5793, + "怎": 5794, + "抱": 5795, + "推": 5796, + "有": 5797, + "樣": 5798, + "為": 5799, + "群": 5800, + "麼": 5801, + "eo": 5802, + "eul": 5803, + "eun": 5804, + "eon": 5805, + "ae": 5806, + "yeon": 5807, + "yeo": 5808, + "ui": 5809, + "hae": 5810, + "geo": 5811, + "neun": 5812, + "ssda": 5813, + "seo": 5814, + "eong": 5815, + "kk": 5816, + "jeo": 5817, + "deul": 5818, + "eum": 5819, + "yeong": 5820, + "geos": 5821, + "hag": 5822, + "aneun": 5823, + "iss": 5824, + "dae": 5825, + "eob": 5826, + "eol": 5827, + "geu": 5828, + "jeong": 5829, + "sae": 5830, + "doe": 5831, + "geul": 5832, + "eulo": 5833, + "bn": 5834, + "sang": 5835, + "bnida": 5836, + "haneun": 5837, + "jeog": 5838, + "saeng": 5839, + "ineun": 5840, + "anh": 5841, + "salam": 5842, + "eom": 5843, + "nae": 5844, + "gwa": 5845, + "yeol": 5846, + "eseo": 5847, + "myeon": 5848, + "ttae": 5849, + "hw": 5850, + "eobs": 5851, + "jang": 5852, + "gw": 5853, + "ileul": 5854, + "yeog": 5855, + "jeon": 5856, + "sig": 5857, + "jag": 5858, + "hago": 5859, + "deun": 5860, + "seong": 5861, + "gag": 5862, + "ham": 5863, + "dang": 5864, + "leul": 5865, + "sil": 5866, + "dong": 5867, + "handa": 5868, + "eossda": 5869, + "aeg": 5870, + "seon": 5871, + "haessda": 5872, + "issda": 5873, + "ege": 5874, + "mul": 5875, + "jung": 5876, + "jig": 5877, + "issneun": 5878, + "geun": 5879, + "seubnida": 5880, + "won": 5881, + "daneun": 5882, + "eoh": 5883, + "deo": 5884, + "gam": 5885, + "jal": 5886, + "haeng": 5887, + "yang": 5888, + "bang": 5889, + "jae": 5890, + "saenggag": 5891, + "hage": 5892, + "sog": 5893, + "eoss": 5894, + "jasin": 5895, + "jil": 5896, + "eog": 5897, + "gyeong": 5898, + "gong": 5899, + "deon": 5900, + "haess": 5901, + "eung": 5902, + "joh": 5903, + "nal": 5904, + "myeong": 5905, + "eona": 5906, + "igo": 5907, + "gyeol": 5908, + "yag": 5909, + "gwan": 5910, + "uli": 5911, + "yong": 5912, + "lyeo": 5913, + "jog": 5914, + "eohge": 5915, + "bog": 5916, + "tong": 5917, + "manh": 5918, + "jeol": 5919, + "geol": 5920, 
+ "aga": 5921, + "naneun": 5922, + "uneun": 5923, + "cheol": 5924, + "dol": 5925, + "bad": 5926, + "hamyeon": 5927, + "yeossda": 5928, + "ibnida": 5929, + "gye": 5930, + "eos": 5931, + "hwal": 5932, + "salamdeul": 5933, + "jiman": 5934, + "dangsin": 5935, + "jib": 5936, + "ttaemun": 5937, + "ib": 5938, + "eneun": 5939, + "eug": 5940, + "jeom": 5941, + "geuleon": 5942, + "hwa": 5943, + "assda": 5944, + "beob": 5945, + "bae": 5946, + "yeoss": 5947, + "chin": 5948, + "chaeg": 5949, + "geon": 5950, + "naega": 5951, + "iga": 5952, + "sigan": 5953, + "gil": 5954, + "hyeon": 5955, + "lyeog": 5956, + "gug": 5957, + "pyeon": 5958, + "wae": 5959, + "jul": 5960, + "seul": 5961, + "deung": 5962, + "hajiman": 5963, + "eumyeon": 5964, + "pil": 5965, + "nyeon": 5966, + "tae": 5967, + "pyo": 5968, + "jineun": 5969, + "beon": 5970, + "hada": 5971, + "seol": 5972, + "sip": 5973, + "daleun": 5974, + "salm": 5975, + "gyo": 5976, + "cheon": 5977, + "hagi": 5978, + "cheoleom": 5979, + "gal": 5980, + "ila": 5981, + "kkaji": 5982, + "anhneun": 5983, + "habnida": 5984, + "tteon": 5985, + "haeseo": 5986, + "doenda": 5987, + "ttal": 5988, + "ilo": 5989, + "seub": 5990, + "byeon": 5991, + "myeo": 5992, + "beol": 5993, + "jeung": 5994, + "chim": 5995, + "hwang": 5996, + "euneun": 5997, + "jong": 5998, + "boda": 5999, + "nol": 6000, + "neom": 6001, + "buteo": 6002, + "jigeum": 6003, + "eobsda": 6004, + "daelo": 6005, + "yul": 6006, + "pyeong": 6007, + "seoneun": 6008, + "salang": 6009, + "seut": 6010, + "heom": 6011, + "hyang": 6012, + "gwang": 6013, + "eobsneun": 6014, + "hwag": 6015, + "gess": 6016, + "jagi": 6017, + "ileon": 6018, + "wihae": 6019, + "daehan": 6020, + "gaji": 6021, + "meog": 6022, + "jyeo": 6023, + "chaj": 6024, + "byeong": 6025, + "eod": 6026, + "gyeo": 6027, + "eoji": 6028, + "gul": 6029, + "modeun": 6030, + "insaeng": 6031, + "geulae": 6032, + "sasil": 6033, + "sib": 6034, + "chal": 6035, + "ilago": 6036, + "geum": 6037, + "doeneun": 6038, + "bol": 6039, + "gajang": 6040, 
+ "geuligo": 6041, + "hyeong": 6042, + "haengbog": 6043, + "chul": 6044, + "chae": 6045, + "mang": 6046, + "dam": 6047, + "choe": 6048, + "sijag": 6049, + "cheong": 6050, + "ilaneun": 6051, + "ulineun": 6052, + "aen": 6053, + "kke": 6054, + "munje": 6055, + "teu": 6056, + "geuneun": 6057, + "bge": 6058, + "cheo": 6059, + "baeg": 6060, + "jug": 6061, + "sangdae": 6062, + "geugeos": 6063, + "dog": 6064, + "eus": 6065, + "jab": 6066, + "hyeo": 6067, + "tteohge": 6068, + "chil": 6069, + "swi": 6070, + "jileul": 6071, + "chang": 6072, + "ganeun": 6073, + "iji": 6074, + "dago": 6075, + "yohan": 6076, + "teug": 6077, + "ppun": 6078, + "aleul": 6079, + "haengdong": 6080, + "sesang": 6081, + "edo": 6082, + "mandeul": 6083, + "amyeon": 6084, + "kkae": 6085, + "bag": 6086, + "ideul": 6087, + "pum": 6088, + "meol": 6089, + "neul": 6090, + "hamkke": 6091, + "chung": 6092, + "dab": 6093, + "yug": 6094, + "sag": 6095, + "gwangye": 6096, + "ileohge": 6097, + "balo": 6098, + "neunde": 6099, + "hamyeo": 6100, + "geuleoh": 6101, + "anila": 6102, + "bangbeob": 6103, + "dasi": 6104, + "byeol": 6105, + "gyeon": 6106, + "gamjeong": 6107, + "oneul": 6108, + "janeun": 6109, + "yeom": 6110, + "lago": 6111, + "igi": 6112, + "hwan": 6113, + "teul": 6114, + "eoseo": 6115, + "sik": 6116, + "jaga": 6117, + "geuleom": 6118, + "geuleona": 6119, + "jeongdo": 6120, + "gyeog": 6121, + "geuleohge": 6122, + "geudeul": 6123, + "eut": 6124, + "imyeon": 6125, + "jjae": 6126, + "keun": 6127, + "isang": 6128, + "malhaessda": 6129, + "euge": 6130, + "nop": 6131, + "ingan": 6132, + "bomyeon": 6133, + "taeg": 6134, + "dwi": 6135, + "saneun": 6136, + "wan": 6137, + "anhgo": 6138, + "nugu": 6139, + "sung": 6140, + "damyeon": 6141, + "adeul": 6142, + "peul": 6143, + "ttala": 6144, + "geosdo": 6145, + "aji": 6146, + "meon": 6147, + "eumyeo": 6148, + "dolog": 6149, + "neung": 6150, + "modu": 6151, + "[ko]": 6152, + "\u0014": 6153, + "\u0016": 6154, + "$": 6155, + "*": 6156, + "|": 6157, + "°": 6158, + "º": 6159, + 
"ँ": 6160, + "ं": 6161, + "ः": 6162, + "अ": 6163, + "आ": 6164, + "इ": 6165, + "ई": 6166, + "उ": 6167, + "ऊ": 6168, + "ऋ": 6169, + "ऎ": 6170, + "ए": 6171, + "ऐ": 6172, + "ऑ": 6173, + "ऒ": 6174, + "ओ": 6175, + "औ": 6176, + "क": 6177, + "ख": 6178, + "ग": 6179, + "घ": 6180, + "ङ": 6181, + "च": 6182, + "छ": 6183, + "ज": 6184, + "झ": 6185, + "ञ": 6186, + "ट": 6187, + "ठ": 6188, + "ड": 6189, + "ढ": 6190, + "ण": 6191, + "त": 6192, + "थ": 6193, + "द": 6194, + "ध": 6195, + "न": 6196, + "ऩ": 6197, + "प": 6198, + "फ": 6199, + "ब": 6200, + "भ": 6201, + "म": 6202, + "य": 6203, + "र": 6204, + "ऱ": 6205, + "ल": 6206, + "ळ": 6207, + "व": 6208, + "श": 6209, + "ष": 6210, + "स": 6211, + "ह": 6212, + "़": 6213, + "ा": 6214, + "ि": 6215, + "ी": 6216, + "ु": 6217, + "ू": 6218, + "ृ": 6219, + "ॄ": 6220, + "ॅ": 6221, + "ॆ": 6222, + "े": 6223, + "ै": 6224, + "ॉ": 6225, + "ॊ": 6226, + "ो": 6227, + "ौ": 6228, + "्": 6229, + "ॐ": 6230, + "ॖ": 6231, + "क़": 6232, + "ख़": 6233, + "ग़": 6234, + "ज़": 6235, + "ड़": 6236, + "ढ़": 6237, + "फ़": 6238, + "य़": 6239, + "ॠ": 6240, + "।": 6241, + "॥": 6242, + "०": 6243, + "१": 6244, + "२": 6245, + "३": 6246, + "४": 6247, + "५": 6248, + "६": 6249, + "७": 6250, + "८": 6251, + "९": 6252, + "॰": 6253, + "ॲ": 6254, + "​": 6255, + "‌": 6256, + "‍": 6257, + "‎": 6258, + "₹": 6259, + "के": 6260, + "है": 6261, + "ें": 6262, + "्र": 6263, + "ार": 6264, + "ने": 6265, + "या": 6266, + "में": 6267, + "से": 6268, + "की": 6269, + "का": 6270, + "ों": 6271, + "ता": 6272, + "कर": 6273, + "स्": 6274, + "कि": 6275, + "को": 6276, + "र्": 6277, + "ना": 6278, + "क्": 6279, + "ही": 6280, + "और": 6281, + "पर": 6282, + "ते": 6283, + "हो": 6284, + "प्र": 6285, + "ान": 6286, + "्य": 6287, + "ला": 6288, + "वा": 6289, + "ले": 6290, + "सा": 6291, + "हैं": 6292, + "लि": 6293, + "जा": 6294, + "हा": 6295, + "भी": 6296, + "वि": 6297, + "इस": 6298, + "ती": 6299, + "न्": 6300, + "रा": 6301, + "मा": 6302, + "दे": 6303, + "दि": 6304, + "बा": 6305, + "ति": 6306, + "था": 6307, + "नि": 6308, + 
"कार": 6309, + "एक": 6310, + "हीं": 6311, + "हु": 6312, + "ंग": 6313, + "ैं": 6314, + "नी": 6315, + "सी": 6316, + "अप": 6317, + "त्": 6318, + "नहीं": 6319, + "री": 6320, + "मे": 6321, + "मु": 6322, + "ित": 6323, + "तो": 6324, + "पा": 6325, + "ली": 6326, + "लिए": 6327, + "गा": 6328, + "ल्": 6329, + "रह": 6330, + "रे": 6331, + "क्ष": 6332, + "मैं": 6333, + "सम": 6334, + "उस": 6335, + "जि": 6336, + "त्र": 6337, + "मि": 6338, + "चा": 6339, + "ोग": 6340, + "सं": 6341, + "द्": 6342, + "सि": 6343, + "आप": 6344, + "तु": 6345, + "दा": 6346, + "कु": 6347, + "यों": 6348, + "वे": 6349, + "जी": 6350, + "्या": 6351, + "उन": 6352, + "िक": 6353, + "ये": 6354, + "भा": 6355, + "्ट": 6356, + "हम": 6357, + "स्ट": 6358, + "शा": 6359, + "ड़": 6360, + "ंद": 6361, + "खा": 6362, + "म्": 6363, + "श्": 6364, + "यह": 6365, + "सक": 6366, + "पू": 6367, + "किया": 6368, + "अपने": 6369, + "रू": 6370, + "सु": 6371, + "मी": 6372, + "हि": 6373, + "जो": 6374, + "थे": 6375, + "रि": 6376, + "दी": 6377, + "थी": 6378, + "गी": 6379, + "लोग": 6380, + "गया": 6381, + "तर": 6382, + "न्ह": 6383, + "च्": 6384, + "वार": 6385, + "बी": 6386, + "प्": 6387, + "दो": 6388, + "टी": 6389, + "शि": 6390, + "करने": 6391, + "गे": 6392, + "ैसे": 6393, + "इन": 6394, + "ंड": 6395, + "साथ": 6396, + "पु": 6397, + "बे": 6398, + "बार": 6399, + "वी": 6400, + "अन": 6401, + "हर": 6402, + "उन्ह": 6403, + "होता": 6404, + "जब": 6405, + "कुछ": 6406, + "मान": 6407, + "क्र": 6408, + "बि": 6409, + "पह": 6410, + "फि": 6411, + "सर": 6412, + "ारी": 6413, + "रो": 6414, + "दू": 6415, + "कहा": 6416, + "तक": 6417, + "शन": 6418, + "ब्": 6419, + "स्थ": 6420, + "वह": 6421, + "बाद": 6422, + "ओं": 6423, + "गु": 6424, + "ज्": 6425, + "्रे": 6426, + "गर": 6427, + "रहे": 6428, + "वर्": 6429, + "हू": 6430, + "ार्": 6431, + "पी": 6432, + "बहु": 6433, + "मुझ": 6434, + "्रा": 6435, + "दिया": 6436, + "सब": 6437, + "करते": 6438, + "अपनी": 6439, + "बहुत": 6440, + "कह": 6441, + "टे": 6442, + "हुए": 6443, + "किसी": 6444, + "रहा": 6445, + "ष्ट": 6446, + "ज़": 6447, 
+ "बना": 6448, + "सो": 6449, + "डि": 6450, + "कोई": 6451, + "व्य": 6452, + "बात": 6453, + "रु": 6454, + "वो": 6455, + "मुझे": 6456, + "द्ध": 6457, + "चार": 6458, + "मेरे": 6459, + "वर": 6460, + "्री": 6461, + "जाता": 6462, + "नों": 6463, + "प्रा": 6464, + "देख": 6465, + "टा": 6466, + "क्या": 6467, + "अध": 6468, + "लग": 6469, + "लो": 6470, + "पि": 6471, + "यु": 6472, + "चे": 6473, + "जिस": 6474, + "ंत": 6475, + "ानी": 6476, + "पै": 6477, + "जन": 6478, + "ारे": 6479, + "ची": 6480, + "मिल": 6481, + "दु": 6482, + "देश": 6483, + "च्छ": 6484, + "ष्": 6485, + "सू": 6486, + "खे": 6487, + "चु": 6488, + "िया": 6489, + "लगा": 6490, + "बु": 6491, + "उनके": 6492, + "ज्ञ": 6493, + "क्षा": 6494, + "तरह": 6495, + "्यादा": 6496, + "वाले": 6497, + "पूर्": 6498, + "मैंने": 6499, + "काम": 6500, + "रूप": 6501, + "होती": 6502, + "उप": 6503, + "जान": 6504, + "प्रकार": 6505, + "भार": 6506, + "मन": 6507, + "हुआ": 6508, + "टर": 6509, + "हूँ": 6510, + "परि": 6511, + "पास": 6512, + "अनु": 6513, + "राज": 6514, + "लोगों": 6515, + "अब": 6516, + "समझ": 6517, + "डी": 6518, + "मौ": 6519, + "शु": 6520, + "चि": 6521, + "पे": 6522, + "कृ": 6523, + "सकते": 6524, + "मह": 6525, + "योग": 6526, + "दर्": 6527, + "उसे": 6528, + "ंध": 6529, + "डा": 6530, + "जाए": 6531, + "बो": 6532, + "ूल": 6533, + "मो": 6534, + "ोंने": 6535, + "ंस": 6536, + "तुम": 6537, + "पहले": 6538, + "बता": 6539, + "तथा": 6540, + "यो": 6541, + "गई": 6542, + "उत्": 6543, + "सकता": 6544, + "कम": 6545, + "ज्यादा": 6546, + "रख": 6547, + "समय": 6548, + "ारा": 6549, + "अगर": 6550, + "स्त": 6551, + "चल": 6552, + "फिर": 6553, + "वारा": 6554, + "करना": 6555, + "शी": 6556, + "गए": 6557, + "बन": 6558, + "ौर": 6559, + "होने": 6560, + "चाह": 6561, + "खु": 6562, + "हाँ": 6563, + "उन्हें": 6564, + "उन्होंने": 6565, + "छो": 6566, + "म्ह": 6567, + "प्रति": 6568, + "निक": 6569, + "वन": 6570, + "्यू": 6571, + "रही": 6572, + "तुम्ह": 6573, + "जैसे": 6574, + "ियों": 6575, + "क्यों": 6576, + "लों": 6577, + "फ़": 6578, + "ंत्र": 6579, + "होते": 6580, + "क्ति": 
6581, + "त्य": 6582, + "कर्": 6583, + "कई": 6584, + "वं": 6585, + "किन": 6586, + "पो": 6587, + "कारण": 6588, + "ड़ी": 6589, + "भि": 6590, + "इसके": 6591, + "बर": 6592, + "उसके": 6593, + "द्वारा": 6594, + "शे": 6595, + "कॉ": 6596, + "दिन": 6597, + "न्न": 6598, + "ड़ा": 6599, + "स्व": 6600, + "निर्": 6601, + "मुख": 6602, + "लिया": 6603, + "टि": 6604, + "ज्ञान": 6605, + "क्त": 6606, + "द्र": 6607, + "ग्": 6608, + "क्स": 6609, + "मै": 6610, + "गो": 6611, + "जे": 6612, + "ट्र": 6613, + "मार": 6614, + "त्व": 6615, + "धार": 6616, + "भाव": 6617, + "करता": 6618, + "खि": 6619, + "कं": 6620, + "चाहि": 6621, + "यर": 6622, + "प्त": 6623, + "कों": 6624, + "ंच": 6625, + "जु": 6626, + "मत": 6627, + "अच्छ": 6628, + "हुई": 6629, + "कभी": 6630, + "लेकिन": 6631, + "भू": 6632, + "अपना": 6633, + "दूस": 6634, + "चाहिए": 6635, + "यू": 6636, + "घर": 6637, + "सबसे": 6638, + "मेरी": 6639, + "नाम": 6640, + "ढ़": 6641, + "ंट": 6642, + "ेंगे": 6643, + "बै": 6644, + "फा": 6645, + "एवं": 6646, + "यी": 6647, + "ग्र": 6648, + "क्षे": 6649, + "आज": 6650, + "आपको": 6651, + "भाग": 6652, + "ठा": 6653, + "कै": 6654, + "भारत": 6655, + "उनकी": 6656, + "पहु": 6657, + "सभी": 6658, + "धा": 6659, + "णा": 6660, + "सान": 6661, + "होगा": 6662, + "तब": 6663, + "संग": 6664, + "पर्": 6665, + "अव": 6666, + "तना": 6667, + "गि": 6668, + "यन": 6669, + "स्था": 6670, + "चित": 6671, + "ट्": 6672, + "छा": 6673, + "जाने": 6674, + "क्षेत्र": 6675, + "वाली": 6676, + "पूर्ण": 6677, + "समा": 6678, + "कारी": 6679, + "[hi]": 6680 + }, + "merges": [ + "t h", + "i n", + "th e", + "a n", + "e r", + "o u", + "r e", + "o n", + "a t", + "e d", + "e n", + "t o", + "in g", + "an d", + "i s", + "a s", + "a l", + "o r", + "o f", + "a r", + "i t", + "e s", + "h e", + "s t", + "l e", + "o m", + "s e", + "b e", + "a d", + "o w", + "l y", + "c h", + "w h", + "th at", + "y ou", + "l i", + "v e", + "a c", + "t i", + "l d", + "m e", + "w as", + "g h", + "i d", + "l l", + "w i", + "en t", + "f or", + "a y", + "r o", + "v er", + "i c", + "h er", + 
"k e", + "h is", + "n o", + "u t", + "u n", + "i r", + "l o", + "w e", + "r i", + "h a", + "wi th", + "gh t", + "ou t", + "i m", + "i on", + "al l", + "a b", + "on e", + "n e", + "g e", + "ou ld", + "t er", + "m o", + "h ad", + "c e", + "s he", + "g o", + "s h", + "u r", + "a m", + "s o", + "p e", + "m y", + "d e", + "a re", + "b ut", + "om e", + "f r", + "the r", + "f e", + "s u", + "d o", + "c on", + "t e", + "a in", + "er e", + "p o", + "i f", + "the y", + "u s", + "a g", + "t r", + "n ow", + "ou n", + "th is", + "ha ve", + "no t", + "s a", + "i l", + "u p", + "th ing", + "fr om", + "a p", + "h im", + "ac k", + "at ion", + "an t", + "ou r", + "o p", + "li ke", + "u st", + "es s", + "b o", + "o k", + "u l", + "in d", + "e x", + "c om", + "s ome", + "the re", + "er s", + "c o", + "re s", + "m an", + "ar d", + "p l", + "w or", + "w ay", + "ti on", + "f o", + "c a", + "w ere", + "b y", + "at e", + "p ro", + "t ed", + "oun d", + "ow n", + "w ould", + "t s", + "wh at", + "q u", + "al ly", + "i ght", + "c k", + "g r", + "wh en", + "v en", + "c an", + "ou gh", + "in e", + "en d", + "p er", + "ou s", + "o d", + "id e", + "k now", + "t y", + "ver y", + "s i", + "a k", + "wh o", + "ab out", + "i ll", + "the m", + "es t", + "re d", + "y e", + "c ould", + "on g", + "you r", + "the ir", + "e m", + "j ust", + "o ther", + "in to", + "an y", + "wh i", + "u m", + "t w", + "as t", + "d er", + "d id", + "i e", + "be en", + "ac e", + "in k", + "it y", + "b ack", + "t ing", + "b r", + "mo re", + "a ke", + "p p", + "the n", + "s p", + "e l", + "u se", + "b l", + "sa id", + "o ver", + "ge t", + "e n", + "e r", + "c h", + "e i", + "i e", + "u n", + "i ch", + "ei n", + "s t", + "a n", + "t e", + "g e", + "a u", + "i n", + "s ch", + "d er", + "un d", + "d ie", + "d a", + "e s", + "a l", + "d en", + "a r", + "g en", + "z u", + "d e", + "h r", + "o n", + "t en", + "e l", + "o r", + "m i", + "s ie", + "da s", + "a t", + "b e", + "ein e", + "ich t", + "b er", + "l e", + "a ch", + "v er", + "s 
e", + "au f", + "w i", + "s o", + "t er", + "l ich", + "c k", + "u r", + "n icht", + "m m", + "b en", + "a s", + "w ar", + "r e", + "mi t", + "s ich", + "i g", + "l l", + "au s", + "i st", + "w ie", + "o ch", + "un g", + "an n", + "ü r", + "h n", + "i hr", + "s a", + "s en", + "t z", + "de m", + "ei t", + "u m", + "h at", + "wi r", + "v on", + "h a", + "s p", + "w ei", + "i er", + "r o", + "h er", + "r a", + "ein en", + "n e", + "v or", + "al s", + "an d", + "al l", + "w as", + "w o", + "r ei", + "st e", + "l ie", + "au ch", + "d u", + "d es", + "k o", + "ü ber", + "a m", + "b ei", + "h en", + "h m", + "l ei", + "a ber", + "w en", + "h l", + "g er", + "i m", + "u t", + "n ach", + "h e", + "i s", + "b r", + "f t", + "en t", + "i mm", + "j e", + "sch en", + "w er", + "s er", + "a b", + "ä n", + "m e", + "s ein", + "i t", + "o l", + "ch t", + "f ür", + "k l", + "f f", + "eine m", + "n en", + "w e", + "j a", + "u s", + "n och", + "hat te", + "t r", + "p f", + "h in", + "d i", + "ch en", + "b l", + "m an", + "r ü", + "ie l", + "s el", + "das s", + "i hn", + "mi r", + "sch l", + "ö n", + "g an", + "g t", + "ein er", + "st en", + "m ich", + "wen n", + "el l", + "g te", + "in d", + "m al", + "ge l", + "k en", + "n ur", + "mm en", + "f ü", + "er n", + "ö r", + "un ter", + "f r", + "an der", + "g r", + "i l", + "d ur", + "u ch", + "f e", + "t a", + "m en", + "m ach", + "d och", + "t i", + "dur ch", + "o s", + "g l", + "h al", + "ihr e", + "w ä", + "imm er", + "i hm", + "k ann", + "or t", + "d ann", + "l an", + "tz t", + "o der", + "hr en", + "e t", + "k ön", + "i ck", + "f a", + "in g", + "i r", + "wie der", + "da ß", + "m ein", + "f en", + "gan z", + "die se", + "st er", + "da r", + "w a", + "ge s", + "n a", + "f l", + "i gen", + "sch e", + "un gen", + "me hr", + "ß en", + "o t", + "k on", + "ge w", + "ha ben", + "ge h", + "ä t", + "s ind", + "d r", + "w el", + "un s", + "v o", + "m a", + "u te", + "sch on", + "b es", + "ge sch", + "b t", + "ch e", + "s on", + "o b", + "l 
a", + "p p", + "rü ck", + "s eine", + "k r", + "f re", + "ei l", + "zu m", + "u l", + "h ier", + "k t", + "i ge", + "sp r", + "k e", + "le ben", + "b st", + "z eit", + "i on", + "g ro", + "den n", + "h o", + "sch a", + "b ar", + "al le", + "ge gen", + "w ür", + "m ü", + "z e", + "wer den", + "je tzt", + "ko mmen", + "n ie", + "s ei", + "h eit", + "so ll", + "g lei", + "m eine", + "wo ll", + "n er", + "ha be", + "w ur", + "lich en", + "p er", + "as sen", + "n te", + "se hen", + "wir d", + "b is", + "g ar", + "i en", + "m us", + "u ß", + "ä r", + "st ell", + "k eit", + "z wei", + "sel bst", + "st a", + "p a", + "sa gte", + "te t", + "k am", + "s sen", + "v iel", + "u g", + "z en", + "h ei", + "m ann", + "wi ll", + "ge b", + "war en", + "ü ck", + "ä ch", + "m er", + "r u", + "w or", + "h au", + "ei gen", + "an g", + "we g", + "bl ick", + "f ra", + "all es", + "k a", + "au gen", + "f in", + "lich e", + "t o", + "un ser", + "der n", + "her r", + "n un", + "v ie", + "ch te", + "wo hl", + "f all", + "h t", + "ü n", + "et was", + "st and", + "en d", + "ä u", + "e m", + "m ö", + "te l", + "r ie", + "d ich", + "die s", + "h and", + "b in", + "ff en", + "nicht s", + "d an", + "p l", + "hn e", + "ihn en", + "es en", + "die ser", + "fr au", + "an t", + "ar t", + "di r", + "i sch", + "er st", + "glei ch", + "ko mm", + "h ör", + "ß e", + "d ig", + "se hr", + "z ei", + "sa m", + "au m", + "h ät", + "in gen", + "g ut", + "b o", + "m ut", + "ck en", + "kon nte", + "st imm", + "p ro", + "zu r", + "i tz", + "wei l", + "wür de", + "f ä", + "kön nen", + "k eine", + "f er", + "i schen", + "vo ll", + "ein es", + "se tz", + "z ie", + "de l", + "te te", + "sein er", + "ier en", + "ge st", + "zu rück", + "wur de", + "sch n", + "p r", + "lie ß", + "t ra", + "m ä", + "gen d", + "f ol", + "i k", + "schl a", + "scha ft", + "at er", + "wei ß", + "s einen", + "l assen", + "l u", + "und en", + "t eil", + "ne u", + "ier t", + "men schen", + "hm en", + "st r", + "g i", + "sa h", + "ihr en", + "el n", 
+ "wei ter", + "ge hen", + "ig er", + "mach t", + "ta g", + "al so", + "hal ten", + "n is", + "ach t", + "ge ben", + "f or", + "o g", + "n at", + "m ar", + "de t", + "o hne", + "h aus", + "t ro", + "an ge", + "l au", + "sp iel", + "t re", + "sch r", + "in n", + "s u", + "l os", + "mach en", + "hät te", + "be g", + "wir k", + "al t", + "g lich", + "te s", + "r icht", + "fre und", + "m o", + "ihr er", + "f el", + "b el", + "so l", + "ein mal", + "e ben", + "h ol", + "h än", + "q u", + "ter n", + "h ö", + "sch w", + "re cht", + "wa hr", + "s einem", + "ste hen", + "hl en", + "in s", + "g ing", + "woll te", + "wi ssen", + "ung s", + "al d", + "as s", + "ja hr", + "m or", + "wel t", + "un der", + "zu sa", + "at ion", + "ko pf", + "lan g", + "hin ter", + "at z", + "st ra", + "an gen", + "an k", + "a de", + "gl au", + "f ach", + "hat ten", + "l o", + "f ort", + "ei cht", + "i ff", + "l er", + "m ei", + "diese m", + "k ein", + "f rei", + "fü hr", + "vo m", + "e s", + "e n", + "a i", + "o u", + "o n", + "l e", + "d e", + "r e", + "q u", + "a n", + "e r", + "en t", + "e t", + "l a", + "n e", + "i l", + "a r", + "i s", + "ai t", + "t e", + "a u", + "i n", + "qu e", + "i t", + "u r", + "s e", + "l es", + "c h", + "c e", + "m e", + "o r", + "ou r", + "a s", + "p r", + "a v", + "o m", + "ai s", + "u n", + "an t", + "ou s", + "t r", + "t i", + "l u", + "o i", + "e u", + "l le", + "s i", + "p ar", + "d es", + "an s", + "m ent", + "é t", + "es t", + "j e", + "u ne", + "a l", + "p as", + "t re", + "qu i", + "d u", + "r i", + "c on", + "s on", + "c om", + "e lle", + "d é", + "p our", + "d ans", + "l i", + "s a", + "r é", + "t ou", + "v ous", + "d i", + "v i", + "a g", + "a m", + "a t", + "ou v", + "a p", + "ti on", + "m on", + "s ur", + "c i", + "o s", + "p lu", + "s u", + "en d", + "a b", + "è re", + "ai n", + "m ais", + "o is", + "r es", + "plu s", + "é e", + "ai ent", + "m p", + "ch e", + "lu i", + "av e", + "ét ait", + "m a", + "s es", + "tou t", + "i r", + "v o", + "a c", + "s 
er", + "an d", + "f f", + "oi r", + "g r", + "av ait", + "é s", + "m es", + "n ous", + "eu x", + "b i", + "t er", + "c o", + "on s", + "p u", + "c es", + "g e", + "t u", + "le ur", + "pr o", + "d on", + "e ur", + "et te", + "ai re", + "ave c", + "d it", + "t é", + "i e", + "u s", + "il le", + "p er", + "com me", + "c r", + "or t", + "m i", + "e x", + "u x", + "v er", + "m o", + "è s", + "v e", + "au x", + "r a", + "j our", + "il s", + "bi en", + "c ou", + "p e", + "que l", + "p eu", + "c ette", + "t es", + "p o", + "in s", + "c u", + "m ê", + "s o", + "f ait", + "g u", + "m ar", + "ê tre", + "l o", + "it é", + "f r", + "a tion", + "en s", + "b r", + "n i", + "l é", + "d is", + "b le", + "m an", + "n é", + "pu is", + "mê me", + "qu es", + "f i", + "e l", + "ag e", + "g ar", + "m oi", + "en ce", + "on t", + "m ain", + "or s", + "au t", + "an ce", + "v en", + "m é", + "s ans", + "e m", + "s é", + "l on", + "h om", + "r o", + "u t", + "c ar", + "ab le", + "i m", + "de r", + "ch er", + "n o", + "vi e", + "au s", + "b e", + "de ux", + "en f", + "o ù", + "t en", + "p h", + "u re", + "te mp", + "p os", + "r ent", + "p é", + "f aire", + "p i", + "tr es", + "ç a", + "an g", + "end re", + "f or", + "p a", + "b on", + "s ou", + "in t", + "pr é", + "s ent", + "t ant", + "n er", + "c er", + "l à", + "l ais", + "pr ès", + "b re", + "c our", + "p et", + "i on", + "i ne", + "com p", + "l ait", + "tr ouv", + "t a", + "ent re", + "son t", + "de v", + "n u", + "temp s", + "d ou", + "r ait", + "b ou", + "qu and", + "jour s", + "l an", + "er s", + "av oir", + "ét é", + "a le", + "p re", + "f ois", + "or te", + "v é", + "m er", + "n on", + "t ous", + "j us", + "cou p", + "t s", + "hom me", + "ê te", + "a d", + "aus si", + "ur s", + "se u", + "or d", + "o b", + "m in", + "g é", + "co re", + "v a", + "v re", + "en core", + "se m", + "i te", + "au tre", + "pr is", + "peu t", + "u e", + "an te", + "m al", + "g n", + "ré p", + "h u", + "si on", + "vo tre", + "di re", + "e z", + "f em", + 
"leur s", + "m et", + "f in", + "c ri", + "m is", + "t our", + "r ai", + "j am", + "re gar", + "ri en", + "ver s", + "su is", + "p ouv", + "o p", + "v is", + "gr and", + "ant s", + "c or", + "re r", + "ar d", + "c é", + "t ent", + "pr es", + "v ou", + "f a", + "al ors", + "si eur", + "ai ne", + "le r", + "qu oi", + "f on", + "end ant", + "ar ri", + "eu re", + "a près", + "don c", + "it u", + "l è", + "s ait", + "t oi", + "ch a", + "ai l", + "as se", + "i mp", + "vo y", + "con n", + "p la", + "pet it", + "av ant", + "n om", + "t in", + "don t", + "d a", + "s ous", + "e mp", + "per son", + "el les", + "be au", + "par ti", + "ch o", + "pr it", + "tou jours", + "m en", + "r ais", + "jam ais", + "tr av", + "tion s", + "tr ès", + "v oi", + "r en", + "y eux", + "f er", + "v oir", + "pre mi", + "c a", + "g ne", + "h eure", + "r ou", + "e ff", + "no tre", + "ment s", + "t on", + "f ais", + "ce la", + "i er", + "rép on", + "con s", + "ai r", + "ô t", + "p endant", + "i ci", + "tou te", + "j et", + "p ort", + "ét aient", + "p en", + "h é", + "au tres", + "p ère", + "o c", + "quel ques", + "i que", + "l is", + "fem me", + "j ou", + "te ur", + "mon de", + "u se", + "n es", + "d re", + "a ff", + "r ap", + "par t", + "le ment", + "c la", + "f ut", + "quel que", + "pr endre", + "r ê", + "ai lle", + "s ais", + "ch es", + "le t", + "ch ar", + "è res", + "ent s", + "b er", + "g er", + "mo ins", + "e au", + "a î", + "j eu", + "h eur", + "é es", + "tr i", + "po int", + "m om", + "v ent", + "n ouv", + "gr an", + "tr ois", + "s ant", + "tout es", + "con tre", + "è rent", + "che z", + "ave z", + "û t", + "a lle", + "at t", + "p au", + "p orte", + "ouv er", + "b ar", + "l it", + "f ort", + "o t", + "as s", + "pr és", + "cho se", + "v it", + "mon sieur", + "h ab", + "t ête", + "j u", + "te ment", + "c tion", + "v rai", + "la r", + "c et", + "regar d", + "l ant", + "de m", + "s om", + "mom ent", + "il les", + "p le", + "p s", + "b es", + "m ère", + "c l", + "s our", + "y s", + "tr op", + "en 
ne", + "jus qu", + "av aient", + "av ais", + "jeu ne", + "de puis", + "person ne", + "f it", + "cer t", + "j o", + "g es", + "ou i", + "r est", + "sem b", + "c ap", + "m at", + "m u", + "lon g", + "fr an", + "f aut", + "it i", + "b li", + "che v", + "pr i", + "ent e", + "ain si", + "ch am", + "l ors", + "c as", + "d o", + "il i", + "b é", + "n os", + "an ge", + "su i", + "r it", + "cr o", + "gu e", + "d e", + "e n", + "e s", + "o s", + "l a", + "e r", + "q u", + "a r", + "a n", + "o n", + "qu e", + "a s", + "o r", + "e l", + "d o", + "a l", + "c i", + "u n", + "r e", + "a b", + "i n", + "t e", + "t o", + "s e", + "d i", + "t r", + "d a", + "c on", + "t a", + "s u", + "m i", + "c o", + "t i", + "l e", + "l os", + "n o", + "l o", + "í a", + "c u", + "c a", + "s i", + "v i", + "m e", + "p or", + "m o", + "p ar", + "r a", + "r i", + "la s", + "c h", + "r o", + "m a", + "p er", + "ó n", + "m en", + "de s", + "un a", + "m p", + "s o", + "ab a", + "p u", + "d os", + "t u", + "g u", + "er a", + "de l", + "h a", + "m u", + "l i", + "en t", + "m b", + "h ab", + "es t", + "g o", + "p a", + "r es", + "par a", + "p o", + "á s", + "m os", + "tr a", + "t en", + "an do", + "p i", + "qu i", + "b i", + "m an", + "co mo", + "v e", + "m ás", + "j o", + "ci ón", + "i s", + "t an", + "v o", + "da d", + "c e", + "a do", + "v er", + "f u", + "ci a", + "c er", + "p e", + "c as", + "c ar", + "men te", + "n i", + "su s", + "t ar", + "n a", + "f i", + "t er", + "z a", + "p ro", + "tr o", + "s a", + "l u", + "b a", + "per o", + "s er", + "c es", + "d as", + "d u", + "s in", + "e mp", + "m ar", + "l la", + "e x", + "á n", + "c or", + "i a", + "v a", + "r an", + "ch o", + "g a", + "y o", + "t os", + "c os", + "mi s", + "l es", + "t es", + "v en", + "h o", + "y a", + "en te", + "on es", + "hab ía", + "n u", + "u s", + "p as", + "h i", + "n os", + "es ta", + "la n", + "m as", + "t or", + "l le", + "h e", + "s on", + "b re", + "p re", + "ab an", + "d or", + "í an", + "i r", + "t as", + "é n", + "r 
u", + "en do", + "a que", + "er o", + "i o", + "qu é", + "m in", + "c ab", + "j a", + "de r", + "t al", + "é s", + "se ñ", + "or a", + "to do", + "la r", + "d on", + "g ar", + "s al", + "p r", + "cu ando", + "j e", + "h u", + "g un", + "b u", + "g i", + "d ar", + "n e", + "r as", + "de n", + "es to", + "par e", + "p en", + "é l", + "tr as", + "c an", + "b o", + "j os", + "mi en", + "pu e", + "c re", + "co mp", + "p on", + "d ía", + "tr os", + "s ab", + "so bre", + "es e", + "mb re", + "er on", + "a ñ", + "m or", + "f or", + "i do", + "por que", + "el la", + "p ri", + "g ran", + "f a", + "c en", + "di s", + "c ri", + "mu y", + "ch a", + "c al", + "es te", + "h as", + "c ó", + "g ra", + "r os", + "p os", + "o b", + "al l", + "aque l", + "j u", + "p res", + "m er", + "di jo", + "c ía", + "ent re", + "z o", + "ci ones", + "bi en", + "mb i", + "el o", + "t ó", + "in a", + "to dos", + "g en", + "ti en", + "est aba", + "de ci", + "ci o", + "h er", + "ñ o", + "l or", + "nu es", + "me di", + "l en", + "vi da", + "f e", + "al i", + "m on", + "c la", + "d re", + "pu es", + "al es", + "vo l", + "m í", + "r ar", + "b le", + "ci on", + "has ta", + "señ or", + "con o", + "a h", + "di os", + "s en", + "es a", + "ú n", + "v ar", + "s an", + "gu i", + "a c", + "o tros", + "ta do", + "bu en", + "ñ a", + "ti emp", + "ha cer", + "j er", + "f er", + "v u", + "f in", + "an a", + "as í", + "an tes", + "t in", + "ve z", + "mien to", + "j ar", + "la b", + "ch e", + "cas a", + "d r", + "es o", + "e go", + "di ó", + "an te", + "est á", + "m al", + "en cia", + "el i", + "í as", + "tiemp o", + "z ar", + "v an", + "m un", + "er ta", + "ta mbi", + "s í", + "b ar", + "a un", + "al e", + "mis mo", + "ent es", + "vi s", + "man o", + "el e", + "na da", + "se gu", + "me j", + "er ra", + "ab le", + "b e", + "ti r", + "un o", + "don de", + "to da", + "des de", + "r en", + "tambi én", + "cu er", + "per son", + "ho mbre", + "o tro", + "li b", + "tr ar", + "cu al", + "ha y", + "a u", + "ca da", + "t aba", 
+ "i mp", + "men to", + "ten ía", + "qu er", + "er an", + "si emp", + "siemp re", + "er to", + "qu í", + "g os", + "pu és", + "el los", + "des pués", + "nu e", + "g an", + "l lo", + "in ter", + "có mo", + "tr i", + "ah ora", + "us te", + "tr aba", + "la do", + "in o", + "po co", + "er te", + "mu jer", + "i m", + "qui er", + "al gun", + "fu e", + "o jos", + "ent on", + "v os", + "es per", + "mu ch", + "o tra", + "a z", + "a d", + "in g", + "e za", + "a quí", + "ci as", + "gu a", + "mu cho", + "deci r", + "es ti", + "i dad", + "al go", + "e z", + "o cu", + "enton ces", + "di do", + "ent os", + "g ri", + "da do", + "i os", + "so l", + "dos e", + "uste d", + "qui en", + "a mi", + "un to", + "f r", + "mi r", + "mej or", + "b as", + "so lo", + "pre gun", + "tu r", + "al g", + "p la", + "to das", + "par te", + "e mb", + "c to", + "mun do", + "tien e", + "tan te", + "pa lab", + "tr an", + "aque lla", + "ci os", + "aun que", + "a y", + "cu en", + "ten er", + "f un", + "res pon", + "all í", + "x i", + "h an", + "pen s", + "con tra", + "tu ra", + "v al", + "di o", + "tr es", + "t re", + "tan to", + "ca min", + "m ó", + "es p", + "a da", + "í o", + "in s", + "ha cia", + "de j", + "est ar", + "i ón", + "g as", + "b er", + "v as", + "no che", + "é r", + "añ os", + "pa dre", + "gu s", + "á r", + "sin o", + "man os", + "ci do", + "es tu", + "a de", + "hu bi", + "vi r", + "b ri", + "ra z", + "ch i", + "pue de", + "men os", + "hab i", + "ho mb", + "ne ces", + "ma y", + "er os", + "r ía", + "he cho", + "es cu", + "l ti", + "án do", + "b us", + "cos as", + "t ú", + "es pa", + "re ci", + "c tor", + "pri m", + "di a", + "de se", + "mien tras", + "h or", + "fu er", + "i da", + "pos i", + "lan te", + "t on", + "an o", + "est as", + "p li", + "ch ar", + "lu ego", + "si ón", + "ci n", + "ti erra", + "m es", + "gu ar", + "ca do", + "en con", + "pr en", + "may or", + "f al", + "e r", + "o n", + "a n", + "t o", + "d i", + "r e", + "l a", + "i n", + "e n", + "a l", + "t a", + "c h", + "e l", + 
"r i", + "c o", + "t i", + "t e", + "s i", + "r a", + "u n", + "l e", + "l i", + "ch e", + "r o", + "c i", + "c a", + "s e", + "q u", + "m a", + "p o", + "s o", + "i l", + "d o", + "e s", + "v a", + "p er", + "l o", + "c on", + "d el", + "p a", + "m o", + "s a", + "p i", + "d a", + "m i", + "g i", + "s u", + "d e", + "v i", + "z i", + "m e", + "g li", + "n o", + "m en", + "v o", + "t u", + "n on", + "v e", + "t to", + "s t", + "on e", + "an o", + "ch i", + "er a", + "er e", + "f a", + "c e", + "z a", + "un a", + "b i", + "p re", + "s ta", + "o r", + "a r", + "f i", + "on o", + "t ra", + "n a", + "n el", + "n e", + "p ro", + "t ro", + "al e", + "v er", + "n i", + "c u", + "t ti", + "men te", + "del la", + "t er", + "zi one", + "g u", + "p e", + "t ta", + "an do", + "t à", + "al i", + "u o", + "qu el", + "co m", + "s en", + "co me", + "b a", + "al la", + "p ri", + "d u", + "qu es", + "l u", + "on i", + "g gi", + "pa r", + "s si", + "v en", + "in a", + "g a", + "pi ù", + "ci a", + "i m", + "co r", + "m an", + "in o", + "in i", + "t en", + "r an", + "b b", + "g o", + "s to", + "t re", + "a ve", + "a v", + "s ono", + "er i", + "a c", + "s se", + "er o", + "h a", + "s c", + "su l", + "f or", + "v ano", + "po r", + "s ti", + "su o", + "c chi", + "t an", + "z za", + "an che", + "p u", + "i o", + "t te", + "vo l", + "es s", + "s ci", + "co l", + "r u", + "p en", + "f u", + "al l", + "s so", + "s te", + "se m", + "s sa", + "d en", + "a d", + "t ri", + "de i", + "in e", + "ave va", + "men to", + "z z", + "a mo", + "g no", + "f o", + "un o", + "su a", + "g en", + "ri a", + "g e", + "st ra", + "s ì", + "c er", + "ch é", + "b u", + "a p", + "c en", + "d al", + "on a", + "s pe", + "g ni", + "b o", + "t t", + "del le", + "ques to", + "nel la", + "f f", + "d ere", + "an no", + "del l", + "un i", + "bb e", + "an ti", + "g ra", + "s p", + "en e", + "gi o", + "u to", + "qu al", + "gli a", + "qu ando", + "tu tto", + "c an", + "gli o", + "zi oni", + "ca m", + "h o", + "es so", + "s s", 
+ "mo l", + "a t", + "lo ro", + "per ché", + "co sa", + "du e", + "po i", + "ca r", + "s co", + "ci o", + "to r", + "c co", + "c re", + "a m", + "g na", + "te m", + "pri ma", + "lu i", + "co sì", + "qu e", + "gu ar", + "ess ere", + "an i", + "con o", + "b ra", + "al le", + "m on", + "ri o", + "an co", + "cu i", + "s pi", + "vi a", + "g ran", + "gi or", + "a i", + "bi le", + "u l", + "ggi o", + "f e", + "an te", + "ma i", + "ta re", + "in ter", + "in di", + "re bbe", + "sen za", + "so lo", + "zi o", + "e d", + "en te", + "tu tti", + "sta to", + "zi a", + "d alla", + "tu ra", + "mi a", + "vi ta", + "quel la", + "qu a", + "ma r", + "do ve", + "g h", + "al lo", + "sem pre", + "zz o", + "si a", + "mo r", + "do po", + "por ta", + "d re", + "c cia", + "er ano", + "an ni", + "di o", + "chi a", + "en za", + "pro pri", + "qu i", + "m u", + "m b", + "an da", + "c ca", + "o cchi", + "ques ta", + "f fi", + "le i", + "par te", + "d on", + "r on", + "mi o", + "tan to", + "ri s", + "o gni", + "di s", + "r in", + "fa r", + "men ti", + "t el", + "anco ra", + "f ra", + "fa tto", + "man i", + "sen ti", + "p ra", + "tem po", + "es si", + "b bi", + "f in", + "a re", + "la re", + "per s", + "f on", + "b el", + "so r", + "d er", + "pre n", + "an za", + "di re", + "pi e", + "o ra", + "ver so", + "se gu", + "al tro", + "ta to", + "ca to", + "a to", + "vol ta", + "c c", + "fa re", + "pa re", + "ci ò", + "li b", + "bi li", + "n uo", + "s er", + "quel lo", + "co lo", + "p po", + "ca sa", + "tro va", + "o re", + "f er", + "r ono", + "d es", + "mol to", + "al mente", + "s ca", + "vo le", + "t ali", + "sul la", + "s ce", + "men o", + "an to", + "p un", + "s tu", + "ca pi", + "so l", + "gi u", + "m ini", + "m ano", + "z e", + "pi a", + "par ti", + "s al", + "la vo", + "ver o", + "r si", + "al tri", + "es ti", + "s cia", + "suo i", + "gli e", + "so tto", + "b ene", + "sc ri", + "t ale", + "de gli", + "n u", + "al c", + "uo mo", + "p el", + "f re", + "po te", + "es sa", + "s cu", + "si gno", + "el 
e", + "st ro", + "u ti", + "di a", + "si one", + "g re", + "f ini", + "ar ri", + "l un", + "c ri", + "e si", + "pa ssa", + "r à", + "men tre", + "an d", + "h anno", + "el o", + "u sci", + "gi a", + "gi à", + "di e", + "m ina", + "b e", + "ti ca", + "gior no", + "t in", + "es se", + "mo do", + "c al", + "s pa", + "propri o", + "l en", + "o ri", + "con tro", + "st ru", + "di ven", + "di sse", + "ra to", + "no i", + "v ere", + "pu ò", + "di ce", + "s an", + "es a", + "c ci", + "se con", + "re n", + "c cio", + "qual che", + "tu tta", + "g g", + "mon do", + "for ma", + "p li", + "m ma", + "pen sa", + "de va", + "tu r", + "fo sse", + "so pra", + "ta mente", + "n ess", + "qu anto", + "ra ga", + "un que", + "ca re", + "st re", + "gran de", + "pi cco", + "guar da", + "b en", + "nel l", + "a ff", + "po ssi", + "pre sen", + "r ò", + "pa ro", + "tu a", + "v in", + "an e", + "a s", + "ste sso", + "da v", + "ne i", + "nel le", + "gh i", + "pi o", + "ta r", + "an a", + "la to", + "si d", + "f ine", + "f uo", + "m er", + "z o", + "qua si", + "ul ti", + "i to", + "su e", + "si e", + "f il", + "allo ra", + "m in", + "ven i", + "t ano", + "el lo", + "d e", + "r a", + "e s", + "d o", + "e n", + "q u", + "c o", + "a s", + "o s", + "e r", + "a r", + "s e", + "qu e", + "a n", + "i n", + "i s", + "t o", + "ã o", + "t e", + "d a", + "m a", + "e l", + "t a", + "o r", + "i a", + "r e", + "e m", + "a l", + "co m", + "p a", + "o u", + "c a", + "u m", + "r o", + "v a", + "t i", + "s o", + "m en", + "n ão", + "h a", + "co n", + "m e", + "r i", + "pa ra", + "p o", + "d i", + "s a", + "v o", + "u ma", + "c i", + "n a", + "p or", + "n o", + "g u", + "s u", + "h o", + "an do", + "t ra", + "e i", + "v i", + "e u", + "i m", + "do s", + "el e", + "r es", + "m o", + "en t", + "f i", + "l a", + "e ra", + "l e", + "de s", + "el a", + "men te", + "l h", + "p er", + "l i", + "ç ão", + "m as", + "t er", + "m u", + "es t", + "v e", + "g o", + "l o", + "u s", + "ma is", + "v er", + "c ê", + "in ha", + "vo cê", 
+ "f a", + "t u", + "c u", + "p ar", + "com o", + "p ro", + "s i", + "m os", + "e c", + "p re", + "d as", + "ç a", + "es ta", + "s er", + "u n", + "da de", + "d is", + "f o", + "e x", + "c h", + "i r", + "ra n", + "t ar", + "en te", + "g a", + "t r", + "p e", + "t os", + "b o", + "c ia", + "p en", + "c ar", + "s en", + "su a", + "se m", + "c as", + "f or", + "to u", + "n os", + "te m", + "r ia", + "m es", + "se u", + "co r", + "o n", + "a o", + "p os", + "ra m", + "v el", + "é m", + "t en", + "po de", + "t es", + "esta va", + "c e", + "b a", + "qu ando", + "m i", + "qu er", + "men to", + "se gu", + "t as", + "is so", + "mu i", + "g ar", + "t ro", + "d u", + "fa z", + "õ es", + "p es", + "an to", + "l u", + "p i", + "i x", + "ve z", + "s im", + "j a", + "p r", + "m in", + "b e", + "ra s", + "m an", + "p res", + "est á", + "c er", + "b re", + "p as", + "d ia", + "m b", + "dis se", + "n i", + "r os", + "es se", + "v ia", + "o lh", + "is a", + "an te", + "ê n", + "z a", + "qu i", + "b i", + "t inha", + "me u", + "s ão", + "m inha", + "a c", + "ri o", + "m ar", + "a t", + "p el", + "mui to", + "ta l", + "to r", + "fo i", + "h or", + "j o", + "b em", + "g i", + "f al", + "vo l", + "po n", + "di z", + "l ar", + "gu n", + "m or", + "r u", + "par ec", + "ç o", + "do r", + "pes so", + "n e", + "f er", + "b er", + "p u", + "po is", + "in a", + "es p", + "d ar", + "en do", + "de n", + "so bre", + "co s", + "p ri", + "al i", + "mes mo", + "ç ões", + "g ra", + "se us", + "me i", + "b ra", + "vi da", + "an tes", + "b ri", + "at é", + "ên cia", + "lh e", + "ti v", + "m ã", + "al g", + "qu anto", + "s ó", + "g os", + "de r", + "t ão", + "tu do", + "ent ão", + "r ou", + "es s", + "in da", + "b al", + "in do", + "ci o", + "n do", + "j á", + "va m", + "re i", + "l es", + "ei to", + "v is", + "tem po", + "de pois", + "c ha", + "m el", + "ch e", + "l ha", + "a inda", + "faz er", + "con tra", + "p ou", + "per gun", + "de ix", + "ta mb", + "ra r", + "al a", + "v en", + "t in", + "pel o", 
+ "tamb ém", + "fi ca", + "pre c", + "el es", + "tra n", + "ha via", + "l á", + "to dos", + "j u", + "qu al", + "c an", + "ta do", + "cas a", + "es sa", + "n as", + "g em", + "m em", + "se i", + "na da", + "sen ti", + "c ri", + "ó s", + "de u", + "ei ro", + ". .", + "f un", + "as sim", + "s ou", + "ent re", + "com e", + "i or", + "h ar", + "f e", + "por que", + "s or", + "f in", + "ta mente", + "a qui", + "cu l", + "t ó", + "for ma", + "s ar", + "ou tra", + "olh os", + "i ma", + "m im", + "a go", + "in s", + "co u", + "g ran", + "v al", + "pesso as", + "era m", + "ei ra", + "a que", + "com p", + "de i", + "p ela", + "co isa", + "m ão", + "con h", + "ca da", + "ago ra", + "ia m", + "h á", + "con s", + "su as", + "gu ém", + "o b", + "l an", + "es ti", + "á s", + "la do", + "in ter", + "ca be", + "por ta", + "n em", + "í vel", + "r is", + "j e", + "n un", + "sem pre", + "con segu", + "h as", + "tra bal", + "f u", + "le v", + "l em", + "l as", + "va i", + "tr os", + "t ante", + "te i", + "pr ó", + "que m", + "tu ra", + "on de", + "cabe ça", + "nun ca", + "men tos", + "h um", + "de le", + "ver dade", + "t á", + "h os", + "el i", + "ent es", + "m er", + "alg um", + "diz er", + "s in", + "pen as", + "n ós", + "en quanto", + "ou tro", + "l ho", + "es te", + "mel hor", + "est ar", + "g an", + "b ar", + "pri mei", + "a u", + "i u", + "pen sa", + "a penas", + "p ra", + "es tou", + "con te", + "res pon", + "ho mem", + "do is", + "a do", + "c al", + "a b", + "l os", + "ç as", + "pou co", + "sen hor", + "t ando", + "esp era", + "pa i", + "ri os", + "no i", + "i da", + "ba ix", + "as e", + "is as", + "f r", + "ho ra", + "mu ndo", + "pas sa", + "fi car", + "to do", + "se ja", + "al mente", + "â n", + "c lar", + "a d", + "in c", + "f os", + "lo n", + "g ri", + "ou vi", + "v em", + "g e", + "ta va", + "á rio", + "mo n", + "s os", + "in ho", + "ma l", + "t an", + "t re", + "gran de", + "ran do", + "b u", + "v ou", + "ê s", + "co isas", + "a conte", + "lh er", + "g en", + "ci on", + 
"an os", + "i do", + "tal vez", + "est ão", + "li v", + "sa b", + "su r", + "ou tros", + "c re", + "qual quer", + "g ou", + "t ri", + "l í", + "tiv esse", + "ra do", + "prec isa", + "mã e", + "su s", + "t anto", + "de la", + "men os", + "s al", + "en tra", + "p é", + "ma ior", + "noi te", + "ti va", + "p ala", + "so n", + "ra ção", + "de us", + "s as", + "un i", + "l or", + "u l", + "in te", + "f ei", + "an o", + "par ti", + "pala v", + "tr ás", + "par te", + "b el", + "ci dade", + "lu gar", + "v os", + "vez es", + "do u", + "en contra", + "tr u", + "e ci", + "a r", + "e r", + "a n", + "e n", + "i n", + "i r", + "o r", + "d e", + "a k", + "ı n", + "a l", + "d i", + "d a", + "b u", + "b ir", + "y or", + "i l", + "e k", + "y a", + "m a", + "l a", + "e l", + "u n", + "k a", + "l ar", + "i m", + "d ı", + "e t", + "o n", + "d u", + "o l", + "e y", + "t ı", + "m i", + "h a", + "b a", + "l er", + "ü n", + "m ı", + "i z", + "l e", + "ı r", + "m e", + "i s", + "n e", + "o k", + "t a", + "s a", + "u m", + "r a", + "g ö", + "i k", + "s ı", + "d en", + "e s", + "b il", + "t i", + "l ı", + "ü z", + "i ç", + "ü r", + "g i", + "u r", + "t e", + "b en", + "d an", + "i y", + "ı m", + "u z", + "v e", + "c ak", + "a y", + "c e", + "i ş", + "ın ı", + "i yor", + "ba ş", + "d ü", + "a t", + "a m", + "g el", + "de ğ", + "k ar", + "i ̇", + "m u", + "e v", + "ö y", + "bu n", + "v ar", + "ya p", + "s en", + "an a", + "s un", + "in i", + "gö r", + "y ı", + "k i", + "l i", + "ar a", + "al ı", + "on u", + "ç ı", + "ş ey", + "s ın", + "k ı", + "ka d", + "s e", + "t an", + "a ğ", + "değ il", + "s in", + "ü k", + "a z", + "ç ok", + "s on", + "ş ı", + "b i", + "ü l", + "t u", + "v er", + "iç in", + "g e", + "k en", + "ey e", + "ol du", + "mı ş", + "y e", + "k al", + "m ek", + "l an", + "öy le", + "yor du", + "er i", + "y üz", + "mi ş", + "b e", + "m ak", + "o la", + "in e", + "y an", + "h er", + "c ek", + "yor um", + "b ak", + "ü m", + "ö n", + "lar ı", + "o ğ", + "d er", + "kad ar", + "h al", + 
"ar ı", + "s t", + "s an", + "ın da", + "du r", + "g ün", + "v a", + "y ok", + "y er", + "dı m", + "k o", + "da ha", + "l u", + "ın a", + "di m", + "e m", + "bil ir", + "ik i", + "s iz", + "s i", + "n a", + "di ğ", + "s u", + "b ü", + "ha y", + "s or", + "dü ş", + "ü ç", + "un u", + "ö r", + "d ir", + "m ü", + "c a", + "am an", + "f ak", + "a da", + "e de", + "son ra", + "h iç", + "ak i", + "ğ ı", + "bu l", + "r u", + "ma z", + "an la", + "bu ra", + "ge ç", + "ma ya", + "l en", + "k onu", + "c i", + "c u", + "d in", + "t ek", + "z aman", + "el er", + "ö z", + "dı r", + "gi bi", + "o t", + "ş a", + "g er", + "ler i", + "k im", + "k u", + "fak at", + "y ar", + "gö z", + "c ı", + "yor sun", + "b ek", + "in de", + "r o", + "p ek", + "bun u", + "l ik", + "m an", + "il er", + "e di", + "ö l", + "s ür", + "b in", + "s ır", + "çı k", + "sı l", + "al ar", + "k es", + "y ak", + "ç ek", + "yı l", + "e cek", + "ı z", + "gi t", + "ka p", + "a ma", + "ı l", + "lar ın", + "b iz", + "tı r", + "o y", + "an cak", + "d oğ", + "ç a", + "b ana", + "ş im", + "baş la", + "l ü", + "ma dı", + "ben i", + "t ir", + "y ük", + "lı k", + "be ş", + "b el", + "b er", + "m er", + "na sıl", + "tı k", + "k e", + "t ür", + "a v", + ". 
.", + "d aki", + "p ar", + "t er", + "ce ğ", + "t en", + "z ı", + "iy i", + "d ok", + "ben im", + "c ağ", + "n er", + "y en", + "ş u", + "me z", + "düş ün", + "ken di", + "şim di", + "y ol", + "y u", + "de v", + "is te", + "s ek", + "ma m", + "s öyle", + "di k", + "t o", + "k ur", + "oldu ğ", + "s ını", + "t ar", + "bil iyor", + "k an", + "y al", + "m eye", + "mu ş", + "f a", + "ka ç", + "bil e", + "iy e", + "t ü", + "e f", + "tı m", + "ev et", + "ç o", + "y et", + "g en", + "bura da", + "t im", + "bir az", + "es i", + "k or", + "doğ ru", + "in in", + "kı z", + "di ye", + "d ör", + "et ti", + "on un", + "is ti", + "ğ i", + "h e", + "s ana", + "ü ş", + "ar ka", + "hay ır", + "kar şı", + "h ar", + "il e", + "h ak", + "ı yor", + "ne den", + "s ev", + "sı z", + "ço cu", + "me m", + "ç alı", + "ol ur", + "b ır", + "g ir", + "is e", + "i h", + "c an", + "k ır", + "d ön", + "b öyle", + "sen i", + "! \"", + "al t", + "dör t", + "s öy", + "o ş", + "mu sun", + "la ş", + "h an", + "i p", + "ka y", + "h em", + "bü yük", + "a ç", + "bır ak", + "mi sin", + "s öz", + "u l", + "değ iş", + "ün ü", + "g ül", + "k ö", + "kar ı", + "ta mam", + "ol u", + "r ar", + "yen i", + "la m", + "mış tı", + "ya ş", + "al a", + "in iz", + "kad ın", + "bun un", + "m ey", + "al tı", + "y i", + "s o", + "in den", + "sen in", + "ya t", + "to p", + "s er", + "is i", + "d ün", + "s es", + "hiç bir", + "y on", + "d ın", + "t ün", + "baş ka", + "a s", + "he p", + "i t", + "ir mi", + "dev am", + "ola cak", + "ar tık", + "r e", + "dur um", + "im iz", + "üz el", + "ler ini", + "sa ğ", + "p ro", + "ger ek", + "y irmi", + "ş ek", + "ba ğ", + "me di", + "lar a", + "a h", + "t ur", + "y ür", + "ma sı", + "ka tı", + "de di", + "g ü", + "sor un", + "el i", + "ün e", + "mı z", + "yap ı", + "m il", + "ğ ını", + "t ara", + "m en", + "ha t", + "var dı", + "m et", + "konu ş", + "ar ak", + "lar ak", + "çocu k", + "bü tün", + "l ey", + "d ür", + "g üzel", + "ay ı", + "yap a", + "n ı", + "ay r", + "ö ne", + "yordu m", + 
"b an", + "i̇ ş", + "du m", + "un a", + "on a", + "yor lar", + "lar ını", + "çı kar", + "z an", + "se ç", + "l iyor", + "t ak", + "şı k", + "tek rar", + "a ş", + "e ş", + "miş ti", + "f ar", + "k in", + "im i", + "i f", + "e ğ", + "gi di", + "le ş", + "başla dı", + "gi de", + "ot ur", + "d de", + "ın dan", + "üz er", + "ın ın", + "n ız", + "u y", + "ye di", + "ka t", + "o larak", + "la dı", + "yal nız", + "ba h", + "iy et", + "m al", + "s ak", + "a çık", + "sın da", + ".. .", + "in san", + "ay nı", + "e der", + "is tan", + "uz un", + "sa h", + "d o", + "g eri", + "er ek", + "ol an", + "ger çek", + "f en", + "al an", + "dı ş", + "alı k", + "far k", + "ü st", + "sa de", + "r i", + "k iş", + "l dı", + "z or", + "et ir", + "her kes", + "s al", + "ö mer", + "s el", + "un da", + "ha f", + "bun a", + "y dı", + "pek i", + "ada m", + "ha z", + "sın a", + "kap ı", + "gör üş", + "sade ce", + "al dı", + "gel di", + "i e", + "n ie", + "n a", + "r z", + "s z", + "c z", + "p o", + "s t", + "c h", + "i ę", + "d z", + "n i", + "a ł", + "r a", + "j e", + "r o", + "d o", + "s ię", + "z a", + "g o", + "e m", + "w i", + "c i", + "rz e", + "k o", + "l e", + "l i", + "w a", + "t o", + "k a", + "m i", + "ż e", + "t a", + "w ie", + "b y", + "m o", + "w y", + "rz y", + "ł a", + "j a", + "n o", + "ł o", + "w o", + "p a", + "m a", + "t e", + "t y", + "n y", + "k i", + "d a", + "n e", + "dz ie", + "dz i", + "cz y", + "c ie", + "m y", + "p rze", + "d y", + "o d", + "l a", + "k ie", + "r y", + "st a", + "j ą", + "ó w", + "c e", + "p rzy", + "c o", + "k u", + "m ie", + "sz y", + "cz e", + "r e", + "b a", + "s i", + "b ie", + "m u", + "w e", + "c y", + "ni a", + "ś ci", + "sz e", + "je st", + "k t", + "s a", + "b o", + "t u", + "ż y", + "n ą", + "b i", + "r u", + "a le", + "kt ó", + "p ra", + "ał a", + "m nie", + "p ie", + "ł y", + "cz a", + "ja k", + "ro z", + "r ó", + "l u", + "z na", + "g a", + "ra z", + "ł u", + "ta k", + "j u", + "p i", + "ś ć", + "s o", + "wi a", + "m ó", + "ch o", + "w 
szy", + "p e", + "s po", + "c a", + "g dy", + "w ał", + "w ię", + "d e", + "b e", + "p ro", + "ł em", + "j ę", + "s k", + "z e", + "l o", + "g i", + "r ę", + "do b", + "d u", + "ju ż", + "st o", + "b ę", + "ał em", + "sz a", + "m e", + "po d", + "d la", + "pa n", + "n ę", + "z o", + "mo że", + "ś li", + "s ie", + "ał o", + "t em", + "l ko", + "ny ch", + "po wie", + "c ię", + "s u", + "ty lko", + "i n", + "b u", + "na j", + "ch a", + "te go", + "p u", + "s ki", + "ne go", + "wszy st", + "sz cze", + "je d", + "je j", + "t wo", + "ą d", + "ś my", + "cz ę", + "wa ć", + "je go", + "ż a", + "i m", + "s y", + "pra w", + "ty m", + "któ ry", + "ał y", + "t rze", + "nie j", + "s e", + "ny m", + "i ch", + "o b", + ". .", + "g ło", + "ją c", + "mó wi", + "s ka", + "o n", + "ne j", + "s łu", + "w ła", + "bę dzie", + "d ę", + "p ó", + "be z", + "ni c", + "p ła", + "ś cie", + "mi a", + "s ą", + "t rzy", + "kie m", + "by ł", + "mo g", + "ro bi", + "ta m", + "c u", + "te n", + "m ię", + "z y", + "pe w", + "ci a", + "my ś", + "prze d", + "s ko", + "n u", + "któ re", + "a l", + "l ę", + "w sze", + "ą c", + "by ło", + "so bie", + "p y", + "ci ą", + "ba r", + "je szcze", + "h a", + "t ę", + "b ra", + "cza s", + "sz ę", + "g ł", + "k ę", + "ma r", + "cz u", + "prze z", + "f i", + "s ło", + "w z", + "k to", + "k ów", + "cz o", + "li śmy", + "st ra", + "wię c", + "r ą", + "ma m", + "w ó", + "rz a", + "g ro", + "no ści", + "f a", + "we t", + "ną ł", + "ś mie", + "na wet", + "mu si", + "s wo", + "te j", + "w ą", + "w u", + "wi ą", + "ni u", + "cz ą", + "b li", + "dz o", + "s kie", + "n em", + "je śli", + "cze go", + "ch y", + "d ł", + "ty ch", + "by m", + "ż o", + "e ś", + "si ą", + "kie dy", + "na s", + "w ró", + "dz e", + "d ro", + "t ra", + "r ów", + "pa ni", + "z ie", + "ku l", + "na d", + "ch wi", + "ni m", + "t ro", + "by ć", + "cho dzi", + "ni o", + "dob rze", + "te raz", + "wo kul", + "co ś", + "k ł", + "pie r", + "h e", + "g dzie", + "dz y", + "p ię", + "d ź", + "k ą", + "g ó", + 
"z da", + "ch ce", + "st ę", + "o r", + "ś wia", + "wszyst ko", + "st ro", + "pe ł", + "wie m", + "wie l", + "ka ż", + "ki m", + "rz u", + "s ły", + "jed na", + "z u", + "myś l", + "mó j", + "g u", + "wa r", + "jest em", + "ó ż", + "mie j", + "mo ż", + "k ła", + "re sz", + "d łu", + "st wo", + "n ię", + "ma sz", + "że by", + "nie m", + "ja kie", + "st y", + "ni ą", + "we j", + "o j", + "g ra", + "s ła", + "no ść", + "z ło", + "sz czę", + ".. .", + "r i", + "le j", + "we go", + "c ał", + "dzi ał", + "ki ch", + "dz a", + "dz ię", + "o czy", + "zo sta", + "cz ło", + "na m", + "ki l", + "o na", + "sz u", + "w ę", + "pa r", + "mi ał", + "st rze", + "ce j", + "e j", + "zna j", + "da ć", + "miej s", + "k ró", + "k ry", + "bar dzo", + "si a", + "z i", + "ś nie", + "l ą", + "g ie", + "cie bie", + "d ni", + "st u", + "po trze", + "wokul ski", + "u wa", + "u mie", + "jedna k", + "k ra", + "wró ci", + "czło wie", + "czy ć", + "by ła", + "że li", + "m ę", + "c ę", + "z robi", + "mog ę", + "pro wa", + "r em", + "nie ch", + "cz nie", + "k ro", + "t ą", + "ch ci", + "b ro", + "dzie ć", + "sz ą", + "pa d", + "t rz", + "t ru", + "je m", + "a ni", + "t ów", + "a r", + "d ru", + "ta j", + "rze kł", + "sa m", + "st e", + "nie go", + "ta kie", + "w ała", + "to wa", + "ka pła", + "wi dzi", + "po dob", + "dz ę", + "t ał", + "stę p", + "b ą", + "po ko", + "w em", + "g ę", + "a by", + "g e", + "al bo", + "s pra", + "z no", + "de n", + "s mo", + "je sz", + "k się", + "jest eś", + "po z", + "ni gdy", + "k sią", + "c óż", + "w s", + "po w", + "t ka", + "ś wie", + "sz ka", + "sa mo", + "s ł", + "rz ę", + "na le", + "chce sz", + "ni k", + "p ę", + "chy ba", + "cią g", + "ją cy", + "wo j", + "na sze", + "mnie j", + "wię cej", + "z wy", + "o sta", + "f e", + "wa ż", + "h o", + "se r", + "śmie r", + "wie r", + "dz ą", + "za ś", + "gdy by", + "ja ki", + "wo l", + "wi n", + "d ą", + "ści a", + "roz ma", + "wa l", + "pa nie", + "sta r", + "ka z", + "je żeli", + "d em", + "w ra", + "ko ń", + "sie bie", 
+ "zno wu", + "p ró", + "cz em", + "st wa", + "i sto", + "pó ł", + "d ał", + "ko bie", + "ała m", + "wy ch", + "ce sa", + "ni ch", + "za wsze", + "dzi ć", + "te ż", + "le pie", + "pro szę", + "k re", + "t wa", + "o t", + "ł ą", + "ch u", + "c ą", + "p rz", + "ł e", + "sze dł", + "od powie", + "my śli", + "ś wią", + "e n", + "e r", + "d e", + "a n", + "e t", + "i j", + "i n", + "e l", + "a a", + "s t", + "o r", + "g e", + "i s", + "a t", + "i e", + "c h", + "o n", + "e en", + "h et", + "i t", + "v er", + "aa r", + "a l", + "o or", + "g en", + "v an", + "o p", + "d en", + "h e", + "o m", + "t e", + "w e", + "i k", + "r e", + "z e", + "ij n", + "d at", + "b e", + "d er", + "in g", + "o e", + "ij k", + "a an", + "ch t", + "v oor", + "l e", + "i et", + "r o", + "m o", + "k en", + "z ijn", + "m en", + "i g", + "j e", + "n iet", + "a r", + "o o", + "i d", + "u n", + "i l", + "s ch", + "mo et", + "st e", + "u r", + "o l", + "he b", + "u it", + "g el", + "w ij", + "a s", + "m e", + "t en", + "w or", + "o u", + "v en", + "l en", + "aa t", + "d it", + "m et", + "r a", + "b en", + "s p", + "o ver", + "d ie", + "n o", + "w er", + "l ijk", + "f t", + "s l", + "an d", + "v e", + "t er", + "i er", + "i en", + "t o", + "d aar", + "g r", + "b el", + "de ze", + "d u", + "a g", + "k an", + "wor den", + "in gen", + "moet en", + "n en", + "on der", + "heb ben", + "r u", + "oo k", + "s en", + "c t", + "k t", + "no g", + "aa l", + "w as", + "u l", + "e er", + "b ij", + "m ijn", + "p ro", + "v ol", + "d o", + "k om", + "at ie", + "e ft", + "k el", + "al s", + "r ij", + "he id", + "a f", + "st el", + "m aar", + "a p", + "we e", + "a d", + "he eft", + "w aar", + "i cht", + "d an", + "er en", + "n e", + "w el", + "w at", + "w il", + "a cht", + "aa g", + "ge b", + "c on", + "z o", + "k e", + "b et", + "h ij", + "d ig", + "k un", + "u w", + "d t", + "d oor", + "t ij", + "a m", + "an g", + "on d", + "er s", + "is ch", + "ge en", + "i ge", + "ge v", + "ve el", + "n u", + "m a", + "on s", + "o f", 
+ "b l", + "n aar", + "g ro", + "p l", + "an der", + "at en", + "kun nen", + "e cht", + "h ier", + "g oe", + "an t", + "u s", + "t wee", + "on t", + "de lijk", + "el e", + "u ur", + "al le", + "t oe", + "me er", + "i st", + "n a", + "n ie", + "on ze", + "l o", + "i m", + "p en", + "h ad", + "tij d", + "h oe", + "to t", + "z ou", + "a k", + "aa k", + "a men", + "d r", + "w oor", + "s e", + "wor dt", + "o t", + "gel ijk", + "g aan", + "i c", + "g er", + "k er", + "el d", + "e m", + "h ou", + "de l", + "z en", + "z el", + "te gen", + "b o", + "kom en", + "c om", + "i gen", + "e it", + "wer k", + "goe d", + "z al", + "z ij", + "sl ag", + "e s", + "z ien", + "a st", + "echt er", + "it ie", + "t ie", + "el ijk", + "m is", + "isch e", + "bel an", + "h aar", + "i ch", + "b er", + "h an", + "v r", + "al e", + "c i", + "gr ijk", + "in d", + "do en", + "l and", + "belan grijk", + "p un", + "op en", + "ct ie", + "zel f", + "m ij", + "it eit", + "ste m", + "me e", + "ar en", + "al l", + "b r", + "re cht", + "d ien", + "h u", + "g aat", + "pro b", + "m oe", + "p er", + "a u", + "ul len", + "z ich", + "daar om", + "or m", + "k l", + "v o", + "en t", + "st aat", + "z it", + "du i", + "n at", + "du s", + "d s", + "ver slag", + "kel ijk", + "prob le", + "w et", + "ge m", + "c r", + "i on", + "p r", + "sch ap", + "g d", + "h un", + "z a", + "er d", + "z et", + "st aan", + "st r", + "m aal", + "in der", + "e id", + "st en", + "p ar", + "k ken", + "ge d", + "z ullen", + "re s", + "men sen", + "j aar", + "re gel", + "ie der", + "vol gen", + "ge ven", + "e ven", + "l u", + "bl ij", + "i ë", + "k o", + "u we", + "m an", + "ma ken", + "l ie", + "g a", + "oe k", + "nie uwe", + "b aar", + "h o", + "h er", + "in ter", + "ander e", + "ru ik", + "s u", + "a gen", + "or t", + "m er", + "ou w", + "st er", + "wil len", + "aa kt", + "h oo", + "an den", + "f f", + "l ig", + "t re", + "s amen", + "ze er", + "dui delijk", + "ant woor", + "he el", + "men t", + "pun t", + "hou den", + "we g", + "vr 
aag", + "gel e", + "een s", + "be sch", + "om en", + "er g", + "do el", + "d ag", + "sp e", + "ur en", + "ing s", + "or en", + "l ang", + "de len", + "m ar", + "ste un", + "in nen", + "p ol", + "o on", + "i de", + "s n", + "s ie", + "r icht", + "z onder", + "no dig", + "all een", + "m id", + "ra gen", + "iet s", + "ver sch", + "geb ruik", + "st u", + "ro uw", + "stel len", + "be g", + "men ten", + "v in", + "eer ste", + "l aat", + "gro ot", + "oo d", + "to ch", + "l aten", + "aar d", + "s le", + "de el", + "st and", + "pl aat", + "re e", + "bet re", + "d i", + "l id", + "uit en", + "ra cht", + "bel eid", + "g et", + "ar t", + "st ie", + "st aten", + "g gen", + "re ken", + "e in", + "al en", + "m ing", + "mo gelijk", + "gro te", + "al tijd", + "z or", + "en kel", + "w ik", + "pol itie", + "e igen", + "el k", + "han del", + "g t", + "k we", + "m aat", + "el en", + "i p", + "v rij", + "s om", + "je s", + "aa m", + "hu is", + "v al", + "we er", + "lid staten", + "k ing", + "k le", + "be d", + "gev al", + "stel l", + "a i", + "wik kel", + "kwe stie", + "t al", + "ste e", + "a b", + "h el", + "kom st", + "p as", + "s s", + "it u", + "i den", + "eer d", + "m in", + "c e", + "p o", + "twee de", + "proble em", + "w aren", + "us sen", + "sn el", + "t ig", + "ge w", + "j u", + "ul t", + "ne men", + "com mis", + "versch il", + "k on", + "z oek", + "k rij", + "gr aag", + "den k", + "l anden", + "re den", + "be sl", + "oe g", + "bet er", + "he den", + "m ag", + "p e", + "bo ven", + "a c", + "con t", + "f d", + "h ele", + "k r", + "v ier", + "w in", + "ge z", + "k w", + "m il", + "v or", + "he m", + "ra m", + "aa s", + "ont wikkel", + "dr ie", + "v aak", + "plaat s", + "l a", + "g ang", + "ij f", + "f in", + "nat uur", + "t ussen", + "u g", + "in e", + "d a", + "b at", + "kom t", + "w acht", + "aa d", + "u t", + "é n", + "acht er", + "geb ie", + "ver k", + "lig t", + "c es", + "nie uw", + "van d", + "s t", + "n í", + "j e", + "p o", + "c h", + "r o", + "n a", + "s e", + "t o", + 
"n e", + "l e", + "k o", + "l a", + "d o", + "r a", + "n o", + "t e", + "h o", + "n ě", + "v a", + "l i", + "l o", + "ř e", + "c e", + "d e", + "v e", + "b y", + "n i", + "s k", + "t a", + "n á", + "z a", + "p ro", + "v o", + "v ě", + "m e", + "v á", + "s o", + "k a", + "r á", + "v y", + "z e", + "m i", + "p a", + "t i", + "st a", + "m ě", + "n é", + "ř i", + "ř í", + "m o", + "ž e", + "m a", + "j í", + "v ý", + "j i", + "d ě", + "r e", + "d a", + "k u", + "j a", + "c i", + "r u", + "č e", + "o b", + "t ě", + "m u", + "k y", + "d i", + "š e", + "k é", + "š í", + "t u", + "v i", + "p ře", + "v í", + "s i", + "n ý", + "o d", + "so u", + "v é", + "n y", + "r i", + "d y", + "b u", + "b o", + "t y", + "l á", + "l u", + "n u", + "ž i", + "m á", + "st i", + "c í", + "z á", + "p ra", + "sk é", + "m í", + "c o", + "d u", + "d á", + "by l", + "st o", + "s a", + "t í", + "je d", + "p ří", + "p ři", + "t é", + "s í", + "č i", + "v ní", + "č a", + "d í", + "z i", + "st u", + "p e", + "b a", + "d ní", + "ro z", + "va l", + "l í", + "s po", + "k á", + "b e", + "p i", + "no u", + "ta k", + "st e", + "r y", + "l é", + "vě t", + "se m", + "p ě", + "ko n", + "ne j", + "l y", + "ko u", + "ý ch", + "b ě", + "p r", + "f i", + "p rá", + "a le", + "ja ko", + "po d", + "ž í", + "z í", + "j sou", + "j sem", + "ch o", + "l ní", + "c ké", + "t á", + "m y", + "a k", + "h u", + "va t", + "pře d", + "h la", + "k e", + "st á", + "č í", + "š i", + "s le", + "k la", + "š tě", + "lo u", + "m ů", + "z na", + "ch á", + "o r", + "p ů", + "h a", + "b i", + "ta ké", + "d ů", + "no st", + "t ře", + "te r", + "p u", + "i n", + "v r", + "ve l", + "sk u", + "v še", + "t ní", + "do b", + "by la", + "č ní", + "ja k", + "v u", + "je ho", + "b ý", + "vá ní", + "ný ch", + "po u", + "te n", + "t ři", + "v z", + "st ře", + "d va", + "h le", + "č á", + "no sti", + "c k", + "v š", + "vo u", + "s u", + "h e", + "h ra", + "je n", + "s y", + "da l", + "po z", + "s lo", + "te l", + "d ru", + "de n", + "vš ak", + "g i", + 
"k dy", + "by lo", + "bu de", + "st ra", + "j ší", + "m é", + "me n", + "vý ch", + "ní m", + "s m", + "ko li", + "r ů", + "t ra", + "mů že", + "ne ní", + "ho d", + "b í", + "do u", + "sk a", + "t ý", + "st ě", + "u je", + "s á", + "pě t", + "ne s", + "k rá", + "to m", + "st ví", + "v ně", + "se d", + "s vé", + "p í", + "z o", + "mu sí", + "u ž", + "tí m", + "jí cí", + "jed no", + "t r", + "ča s", + "e v", + "č ty", + "sk ý", + "ni c", + "ev ro", + "to ho", + "h y", + "k ter", + "r ní", + "st í", + "s vě", + "pa k", + "vše ch", + "k ů", + "n g", + "á d", + "chá zí", + "a ni", + "a r", + "jed na", + "bý t", + "t ro", + "k ra", + "pr vní", + "m no", + "ské ho", + "p á", + "p la", + "le m", + "ne bo", + "ke m", + "st ro", + "s la", + "né ho", + "z de", + "dal ší", + "ř a", + "čty ři", + "h rá", + "dru h", + "l ně", + "v la", + "sk ých", + "š ko", + "pů so", + "pro to", + "v ů", + "sk á", + "ve n", + "še st", + "d ně", + "je ště", + "me zi", + "te k", + "s ko", + "ch a", + "ně koli", + "be z", + "g ra", + "ji ž", + "č ně", + "j á", + "s lu", + "z ná", + "ve r", + "sed m", + "k ro", + "ta m", + "a no", + "v lá", + "o sm", + "byl y", + "vá m", + "ck ý", + "te ch", + "dě ji", + "vel mi", + "le ži", + "va la", + "l ý", + "t vo", + "spo le", + "ch u", + "stu p", + "mo ž", + "evro p", + "g e", + "sta l", + "j de", + "ch y", + "ro di", + "je jí", + "po li", + "de vět", + "s me", + "a ž", + "té to", + "re m", + "d é", + "f or", + "u ni", + "f o", + "ten to", + "a u", + "ka ž", + "nu la", + "na d", + "by ch", + "mo c", + "sto u", + "e x", + "le n", + "k do", + "z d", + "pra co", + "to mu", + "ný m", + "ži vo", + "ze m", + "f e", + "f u", + "ná sle", + "j o", + "sk y", + "ji ch", + "h á", + "mě l", + "dě la", + "j sme", + "p re", + "ni ce", + "ste j", + "ne m", + "st ní", + "he m", + "ná ro", + "z u", + "b li", + "ni t", + "pa r", + "a l", + "poz ději", + "ta ko", + "n ce", + "če r", + "ší m", + "ně co", + "vá l", + "ře j", + "krá t", + "á lní", + "u r", + ". 
.", + "a si", + "kter é", + "sta v", + "ma jí", + "my s", + "do bě", + "s ně", + "ce n", + "z y", + "z ku", + "t ů", + "ch od", + "s pě", + "je jich", + "sou čas", + "d r", + "va li", + "ri e", + "k te", + "pr ů", + "ze ní", + "pa t", + "a n", + "po tře", + "de m", + "d nes", + "ze mí", + "sa mo", + "zna m", + "b ra", + "má m", + "te dy", + "g o", + "hla vní", + "pou ží", + "b ní", + "ve de", + "le p", + "je k", + "pra v", + "poli ti", + "d ne", + "je m", + "le t", + "če ní", + "pro b", + "ne ž", + "dě l", + "fi l", + "č o", + "cí ch", + "st é", + "d lou", + "h i", + "a by", + "to u", + "několi k", + "d la", + "vy u", + "vi t", + "ho u", + "ck ých", + "no vé", + "či n", + "st y", + "dě lá", + "k ý", + "ob la", + "pod le", + "ra n", + "dů leži", + "ta to", + "po ku", + "ko ne", + "d ý", + "d vě", + "ž ád", + "nou t", + "t ku", + "t vr", + "cké ho", + "ro v", + "r é", + "te le", + "p sa", + "s vět", + "ti vní", + "do sta", + "te m", + "še l", + "druh é", + "s kou", + "ž o", + "jed ná", + "vý znam", + "prob lé", + "pu bli", + "vá n", + "od po", + "pod po", + "d le", + "ja ké", + "še ní", + "ví m", + "bě hem", + "na chází", + "s lou", + "pou ze", + "o tá", + "p lo", + "to vé", + "vět ši", + "ko mi", + "va jí", + "ty to", + "zá pa", + "z mě", + "mo h", + "ví ce", + "spole č", + "au to", + "pro ti", + "st ru", + "dě t", + "chá ze", + "že l", + "с т", + "е н", + "н о", + "н а", + "п р", + "т о", + "п о", + "р а", + "г о", + "к о", + "н е", + "в о", + "в а", + "е т", + "е р", + "н и", + "е л", + "и т", + "н ы", + "з а", + "р о", + "ен и", + "к а", + "л и", + "е м", + "д а", + "о б", + "л а", + "д о", + "с я", + "т ь", + "о т", + "л о", + "л ь", + "е д", + "с о", + "м и", + "р е", + "м о", + "ц и", + "пр о", + "т а", + "э то", + "к и", + "р у", + "пр и", + "т и", + "с е", + "ст а", + "в ы", + "м ы", + "в и", + "б ы", + "м а", + "е с", + "л я", + "ст и", + "л е", + "ч то", + "м е", + "р и", + "ч а", + "о д", + "е й", + "ел ь", + "ени я", + "г а", + "н у", + "с и", + "п а", + 
"ра з", + "б о", + "ст о", + "с у", + "с а", + "д у", + "е го", + "е ст", + "и н", + "ит ь", + "и з", + "ж е", + "м у", + "п ер", + "по д", + "ени е", + "с ь", + "к у", + "пр ед", + "но го", + "ны х", + "в ер", + "т е", + "но й", + "ци и", + "д е", + "р ы", + "д ел", + "л ю", + "в е", + "о н", + "м ен", + "г и", + "н я", + "б у", + "пр а", + "в се", + "ет ся", + "ст ь", + "ж а", + "до л", + "ж и", + "б е", + "ко н", + "с л", + "ш и", + "д и", + "ст в", + "с ко", + "ны е", + "ч и", + "ю т", + "д ер", + "ст ра", + "т ы", + "х од", + "щ и", + "з о", + "з на", + "но сти", + "ч ес", + "в ля", + "ва ть", + "о р", + "по л", + "в ет", + "та к", + "ш а", + "т у", + "с во", + "пр е", + "о на", + "ит ель", + "ны й", + "с ло", + "ка к", + "в л", + "но сть", + "х о", + "мо ж", + "п е", + "д ля", + "ни я", + "но е", + "ра с", + "дол ж", + "да р", + "т ель", + "с ка", + "п у", + "ст во", + "ко то", + "ра б", + "е е", + "ро д", + "э ти", + "с об", + "о ру", + "ж ен", + "ны м", + "ит и", + "ни е", + "ко м", + "д ет", + "ст у", + "г у", + "п и", + "ме ж", + "ени ю", + "т ер", + "раб от", + "во з", + "ци я", + "ко й", + "щ ест", + "г ра", + "з и", + "р я", + "меж ду", + "ст ва", + "в с", + "ел о", + "ш е", + "м ер", + "б а", + "з ы", + "л у", + "а ль", + "д ей", + "г ла", + "на род", + "к ти", + "пред ста", + "л ся", + "я вля", + "с ки", + "но в", + "ед ин", + "ро в", + "и с", + "ни ма", + "р ем", + "ход и", + "так же", + "д ру", + "а ть", + "сл ед", + "го во", + "на я", + "ю щи", + "ен ь", + "кото ры", + "х от", + "в у", + "и х", + "ем у", + "ч ит", + "ва ж", + "ор га", + "чес ки", + "щ е", + "к е", + "х а", + "по с", + "то м", + "бо ль", + "м не", + "па с", + "об ъ", + "пра в", + "кон ф", + "сл у", + "под дер", + "ст ви", + "на ш", + "ль ко", + "сто я", + "ну ю", + "л ем", + "ен ных", + "к ра", + "д ы", + "между народ", + "г да", + "не об", + "го су", + "ств у", + "ени и", + "госу дар", + "к то", + "и м", + "ч ест", + "р ет", + "во про", + "л ен", + "ел и", + "ро ва", + "ци й", + 
"на м", + "это й", + "ж ения", + "необ ходи", + "мен я", + "бы ло", + "си ли", + "ф и", + "в я", + "ш ь", + "это го", + "о ни", + "орга ни", + "бе зо", + "пр об", + "и ме", + "ре ш", + "б и", + "безо пас", + "ют ся", + "о ста", + "ен но", + "го д", + "ел а", + "предста в", + "ть ся", + "сло во", + "органи за", + "долж ны", + "это м", + "б ла", + "ч е", + "ч у", + "бла го", + "это му", + "в рем", + "с пе", + "но м", + "ени й", + "с по", + "на с", + "не т", + "з у", + "в ед", + "е ще", + "ска за", + "се й", + "ер ен", + "да н", + "са м", + "ел я", + "ра н", + "зы ва", + "явля ется", + "бу дет", + "кти в", + "т ре", + "дел е", + "м от", + "конф ерен", + "ла сь", + "ча с", + "сто ро", + "ко го", + "е з", + "не й", + "о с", + "ли сь", + "раз ору", + "пер е", + "с си", + "ны ми", + "про ц", + "го ло", + "ч ело", + "бо ле", + "чело ве", + "с ер", + "п л", + "ч ет", + "стра н", + "п я", + "бы л", + "к ла", + "то в", + "ж д", + "дел а", + "е ра", + "у же", + "со вет", + "г ен", + "безопас ности", + "ц а", + "се да", + "по з", + "от вет", + "проб лем", + "на ко", + "т ем", + "до ста", + "п ы", + "щ а", + "во й", + "су щест", + "необходи мо", + "бы ть", + "мож ет", + "д ем", + "что бы", + "е к", + "ч ер", + "у сили", + "ре с", + "ру д", + "един енных", + "д об", + "до сти", + "ств ен", + "я дер", + "год ня", + "ка за", + "се годня", + "сей час", + "то лько", + "во д", + "ес ь", + "м ного", + "бу ду", + "е в", + "ест ь", + "т ри", + "об щест", + ". 
.", + "я вл", + "вы сту", + "р ед", + "с чит", + "с ит", + "деле га", + "ло ж", + "это т", + "ф ор", + "к лю", + "воз мож", + "ва ния", + "б ли", + "и ли", + "в з", + "на ций", + "ско го", + "при ня", + "п ла", + "о ч", + "ить ся", + "ст е", + "на ши", + "которы е", + "а р", + "име ет", + "с от", + "зна ч", + "пер ь", + "след у", + "ен ы", + "та ки", + "объ единенных", + "ст ро", + "те перь", + "б ле", + "благо дар", + "раз в", + "а н", + "жи ва", + "оч ень", + "я т", + "бе з", + "об ес", + "г ро", + "ло сь", + "с ы", + "организа ции", + "ч лен", + "то го", + "она ль", + "ж да", + "все х", + "с вя", + "боле е", + "со в", + "ко гда", + "во т", + "к ре", + "к ры", + "по этому", + "во ль", + "о й", + "ген ера", + "ч ем", + "л ы", + "пол ити", + "в ен", + "конферен ции", + "проц ес", + "б я", + "ит е", + "от но", + "разв ити", + "а ф", + "ю щ", + "в но", + "ми р", + "ни и", + "ка я", + "а с", + "итель но", + "в то", + "ени ем", + "генера ль", + "пр от", + "вс ем", + "сам бле", + "ас самбле", + "о м", + "з д", + "с мот", + "ре ги", + "ч его", + "од нако", + "усили я", + "дей стви", + "ч но", + "у ча", + "об раз", + "во с", + "э та", + "пер его", + "гово р", + "ва м", + "мо ло", + "врем я", + "д ь", + "хот ел", + "г ру", + "за явл", + "пре доста", + "по ль", + "не е", + "ре зо", + "перего во", + "резо лю", + "к рет", + "поддер ж", + "обес пе", + "не го", + "представ ит", + "на де", + "к ри", + "ч ь", + "про ек", + "л ет", + "дру ги", + "ا ل", + "َ ا", + "و َ", + "ّ َ", + "ِ ي", + "أ َ", + "ل َ", + "ن َ", + "ال ْ", + "ه ُ", + "ُ و", + "م ا", + "ن ْ", + "م ن", + "ع َ", + "ن ا", + "ل ا", + "م َ", + "ت َ", + "ف َ", + "أ ن", + "ل ي", + "م ِ", + "ا ن", + "ف ي", + "ر َ", + "ي َ", + "ه ِ", + "م ْ", + "ق َ", + "ب ِ", + "ل ى", + "ي ن", + "إ ِ", + "ل ِ", + "و ا", + "ك َ", + "ه ا", + "ً ا", + "م ُ", + "و ن", + "ال م", + "ب َ", + "ي ا", + "ذ ا", + "س ا", + "ال ل", + "م ي", + "ي ْ", + "ر ا", + "ر ي", + "ل ك", + "م َا", + "ن َّ", + "ل م", + "إ ن", + "س ت", + "و م", + "ّ َا", + "ل َا", 
+ "ه م", + "ّ ِ", + "ك ُ", + "ك ان", + "س َ", + "ب ا", + "د ي", + "ح َ", + "ع ْ", + "ب ي", + "ال أ", + "و ل", + "ف ِي", + "ر ِ", + "د ا", + "مِ نْ", + "ُو نَ", + "و ْ", + "ه َا", + "ّ ُ", + "ال س", + "ال َ", + "ن ي", + "ل ْ", + "ت ُ", + "ه ل", + "ر ة", + "د َ", + "س ْ", + "ت ِ", + "ن َا", + "ر ْ", + "الل َّ", + "سا مي", + "ك ن", + "ك ل", + "ه َ", + "عَ لَ", + "ع لى", + "م ع", + "إ لى", + "ق د", + "ال ر", + "ُو ا", + "ي ر", + "ع ن", + "ي ُ", + "ن ِ", + "ب ْ", + "ال ح", + "هُ مْ", + "ق ا", + "ذ ه", + "ال ت", + "ِي نَ", + "ج َ", + "ه ذا", + "ع د", + "ال ع", + "د ْ", + "قَ الَ", + "ر ُ", + "ي م", + "ي ة", + "ن ُ", + "خ َ", + "ر ب", + "ال ك", + "و َا", + "أ نا", + "ة ِ", + "ال ن", + "ح د", + "ع ِ", + "ت ا", + "ه و", + "ف ا", + "ع ا", + "ال ش", + "ل ُ", + "ي ت", + "ذ َا", + "ي ع", + "ال ذ", + "ح ْ", + "ال ص", + "إِ نَّ", + "ج ا", + "ع لي", + "ك َا", + "ب ُ", + "ت ع", + "و ق", + "م ل", + "ل َّ", + "ي د", + "أ خ", + "ر ف", + "ت ي", + "ال ِ", + "ّ ا", + "ذ لك", + "أَ نْ", + "س ِ", + "ت وم", + "م ر", + "مَ نْ", + "ب ل", + "ال ق", + "الل ه", + "ِي َ", + "ك م", + "ذ َ", + "ع ل", + "ح ب", + "س ي", + "ع ُ", + "ال ج", + "ال د", + "ش َ", + "ت ك", + "ف ْ", + "ص َ", + "ل ل", + "د ِ", + "ب ر", + "ف ِ", + "ت ه", + "أ ع", + "ت ْ", + "ق ْ", + "الْ أَ", + "ئ ِ", + "عَ نْ", + "و ر", + "ح ا", + "ال َّ", + "م ت", + "ف ر", + "د ُ", + "ه نا", + "وَ أَ", + "ت ب", + "ة ُ", + "أ ي", + "س ب", + "ري د", + "و ج", + "كُ مْ", + "ح ِ", + "ك ْ", + "د ر", + "َا ء", + "ه ذه", + "ال ط", + "الْ مُ", + "د ة", + "ق ل", + "غ َ", + "ي وم", + "الَّ ذ", + "ك ر", + "ت ر", + "ك ِ", + "ك ي", + "عَلَ ى", + "رَ ب", + "ع ة", + "ق ُ", + "ج ْ", + "ف ض", + "ل ة", + "ه ْ", + "ر َا", + "وَ لَ", + "الْ مَ", + "أَ نَّ", + "ي َا", + "أ ُ", + "ش ي", + "اللَّ هُ", + "لَ ى", + "ق ِ", + "أ ت", + "عَلَ يْ", + "اللَّ هِ", + "ال ب", + "ض َ", + "ة ً", + "ق ي", + "ا ر", + "ب د", + "خ ْ", + "سْ تَ", + "ط َ", + "قَ دْ", + "ذه ب", + "أ م", + "ما ذا", + "وَ إِ", + "ة ٌ", + "و نَ", + "لي لى", + "و لا", + "ح ُ", + "ه ي", + "ص ل", + "ال خ", 
+ "و د", + "لي س", + "ل دي", + "ق ال", + "كَا نَ", + "م َّ", + "ح ي", + "ت م", + "ل ن", + "وَ لَا", + "ب ع", + "يم كن", + "س ُ", + "ة َ", + "ح ت", + "ر ًا", + "ك ا", + "ش ا", + "هِ مْ", + "لَ هُ", + "ز َ", + "دا ً", + "م س", + "ك ث", + "الْ عَ", + "ج ِ", + "ص ْ", + "ف َا", + "ل ه", + "و ي", + "ع َا", + "هُ وَ", + "ب ِي", + "ب َا", + "أ س", + "ث َ", + "ل ِي", + "ر ض", + "الر َّ", + "لِ كَ", + "ت َّ", + "ف ُ", + "ق ة", + "ف عل", + "مِ ن", + "ال آ", + "ث ُ", + "س م", + "م َّا", + "بِ هِ", + "ت ق", + "خ ر", + "ل قد", + "خ ل", + "ش ر", + "أن ت", + "ل َّا", + "س ن", + "الس َّ", + "الذ ي", + "س َا", + "و ما", + "ز ل", + "و ب", + "أ ْ", + "إ ذا", + "ر ِي", + "ح ة", + "ن ِي", + "الْ حَ", + "وَ قَالَ", + "ب ه", + "ة ٍ", + "س أ", + "ر ٌ", + "ب ال", + "م ة", + "ش ْ", + "و ت", + "عن د", + "ف س", + "بَ عْ", + "ه ر", + "ق ط", + "أ ح", + "إن ه", + "و ع", + "ف ت", + "غ ا", + "هنا ك", + "ب ت", + "مِ نَ", + "س ر", + "ذَ لِكَ", + "ر س", + "حد ث", + "غ ْ", + "ّ ِي", + "ال إ", + "وَ يَ", + "ج ل", + "ا ست", + "ق ِي", + "ع ب", + "و س", + "ي ش", + "الَّذ ِينَ", + "تا ب", + "د ِي", + "ج ب", + "ك ون", + "ب ن", + "ال ث", + "لَ يْ", + "ب عد", + "وَ الْ", + "فَ أَ", + "ع م", + "هُ م", + "ت ن", + "ذ ْ", + "أ ص", + "أ ين", + "رَب ِّ", + "الذ ين", + "إِ ن", + "ب ين", + "ج ُ", + "عَلَيْ هِ", + "ح َا", + "ل و", + "ست ط", + "ظ ر", + "لَ مْ", + "ء ِ", + "كُ ل", + "ط ل", + "ت َا", + "ض ُ", + "كن ت", + "ل ًا", + "م ٌ", + "ق بل", + "ـ ـ", + "ذ ِ", + "قَ وْ", + "ص ِ", + "م ًا", + "كان ت", + "ص ا", + "ي ق", + "ال ف", + "ال نا", + "م ٍ", + "إِ نْ", + "ال نَّ", + "ج د", + "وَ مَا", + "ت ت", + "ب ح", + "م كان", + "كي ف", + "ّ ة", + "ال ا", + "ج َا", + "أ و", + "سا عد", + "ض ِ", + "إ لا", + "را ً", + "ق َا", + "ر أ", + "ع ت", + "أ حد", + "ه د", + "ض ا", + "ط ر", + "أ ق", + "ما ء", + "د َّ", + "ال با", + "م ُو", + "أَ وْ", + "ط ا", + "ق ُو", + "خ ِ", + "ت ل", + "ستط يع", + "د َا", + "الن َّا", + "إ لَى", + "وَ تَ", + "هَ ذَا", + "ب ة", + "علي ك", + "ج ر", + "ال من", + "ز ا", + "ر ٍ", + "د ع", + "ّ ًا", + "س ة", 
+ "ثُ مَّ", + "شي ء", + "ال غ", + "ت ح", + "ر ُونَ", + "ال يوم", + "م ِي", + "ن ُوا", + "أ ر", + "تُ مْ", + "ع ر", + "ي ف", + "أ ب", + "د ًا", + "ص َا", + "الت َّ", + "أ ريد", + "ال ز", + "يَ وْ", + "إ لي", + "ج ي", + "يَ عْ", + "فض ل", + "ال إن", + "أن ه", + "n g", + "i 4", + "a n", + "s h", + "z h", + "i 2", + "ng 1", + "u 4", + "i 1", + "ng 2", + "d e", + "j i", + "a o", + "x i", + "u 3", + "de 5", + "e 4", + "i 3", + "ng 4", + "an 4", + "e n", + "u o", + "sh i4", + "an 2", + "u 2", + "c h", + "u 1", + "ng 3", + "a 1", + "an 1", + "e 2", + "a 4", + "e i4", + "o ng1", + "a i4", + "ao 4", + "h u", + "a ng1", + "l i", + "y o", + "an 3", + "w ei4", + "uo 2", + "n 1", + "en 2", + "ao 3", + "e 1", + "y u", + "q i", + "e ng2", + "zh o", + "a ng3", + "a ng4", + "a ng2", + "uo 4", + "m i", + "g e4", + "y i1", + "g uo2", + "e r", + "b i", + "a 3", + "h e2", + "e 3", + "y i2", + "d i4", + "zh ong1", + "b u4", + "g u", + "a i2", + "n 2", + "z ai4", + "sh i2", + "e ng1", + "r en2", + "o ng2", + "xi an4", + "y i", + "x u", + "n 4", + "l i4", + "en 4", + "y u2", + "e i2", + "yi2 ge4", + "o u4", + "e i3", + "d i", + "u i4", + "a 2", + "yo u3", + "ao 1", + "d a4", + "ch eng2", + "en 1", + "e ng4", + "y i4", + "s i1", + "zh i4", + "ji a1", + "yu an2", + "n i", + "t a1", + "de5 yi2ge4", + "k e1", + "sh u3", + "x i1", + "j i2", + "ao 2", + "t i", + "o u3", + "o ng4", + "xi a4", + "a i1", + "g ong1", + "zh i1", + "en 3", + "w ei2", + "j u", + "xu e2", + "q u1", + "zho u1", + "er 3", + "mi ng2", + "zho ng3", + "l i3", + "w u4", + "y i3", + "uo 1", + "e 5", + "j i4", + "xi ng2", + "ji an4", + "hu a4", + "y u3", + "uo 3", + "j i1", + "a i3", + "z uo4", + "h ou4", + "hu i4", + "e i1", + "ni an2", + "q i2", + "p i", + "d ao4", + "sh eng1", + "de 2", + "d ai4", + "u an2", + "zh e4", + "zh eng4", + "b en3", + "sh ang4", + "zh u3", + "b ei4", + "y e4", + "ch u1", + "zh an4", + "l e5", + "l ai2", + "sh i3", + "n an2", + "r en4", + "yo u2", + "k e4", + "b a1", + "f u4", + "d ui4", + "y a4", + 
"m ei3", + "z i4", + "xi n1", + "ji ng1", + "zh u", + "n 3", + "yo ng4", + "m u4", + "ji ao4", + "y e3", + "ji n4", + "bi an4", + "l u4", + "q i1", + "sh e4", + "xi ang1", + "o ng3", + "sh u4", + "d ong4", + "s uo3", + "gu an1", + "s an1", + "b o", + "t e4", + "d uo1", + "f u2", + "mi n2", + "l a1", + "zh i2", + "zh en4", + "o u1", + "w u3", + "m a3", + "i 5", + "z i5", + "j u4", + "er 4", + "y ao4", + "xia4 de5yi2ge4", + "s i4", + "t u2", + "sh an1", + "z ui4", + "ch u", + "yi n1", + "er 2", + "t ong2", + "d ong1", + "y u4", + "y an2", + "qi an2", + "shu3 xia4de5yi2ge4", + "ju n1", + "k e3", + "w en2", + "f a3", + "l uo2", + "zh u4", + "x i4", + "k ou3", + "b ei3", + "ji an1", + "f a1", + "di an4", + "ji ang1", + "wei4 yu2", + "xi ang4", + "zh i3", + "e ng3", + "f ang1", + "l an2", + "sh u", + "r i4", + "li an2", + "sh ou3", + "m o", + "qi u2", + "ji n1", + "h uo4", + "shu3xia4de5yi2ge4 zhong3", + "f en1", + "n ei4", + "g ai1", + "mei3 guo2", + "u n2", + "g e2", + "b ao3", + "qi ng1", + "g ao1", + "t ai2", + "d u", + "xi ao3", + "ji e2", + "ti an1", + "ch ang2", + "q uan2", + "li e4", + "h ai3", + "f ei1", + "t i3", + "ju e2", + "o u2", + "c i3", + "z u2", + "n i2", + "bi ao3", + "zhong1 guo2", + "d u4", + "yu e4", + "xi ng4", + "sh eng4", + "ch e1", + "d an1", + "ji e1", + "li n2", + "pi ng2", + "f u3", + "g u3", + "ji e4", + "w o", + "v 3", + "sh eng3", + "n a4", + "yu an4", + "zh ang3", + "gu an3", + "d ao3", + "z u3", + "di ng4", + "di an3", + "c eng2", + "ren2 kou3", + "t ai4", + "t ong1", + "g uo4", + "n eng2", + "ch ang3", + "hu a2", + "li u2", + "yi ng1", + "xi ao4", + "c i4", + "bian4 hua4", + "li ang3", + "g ong4", + "zho ng4", + "de5 yi1", + "s e4", + "k ai1", + "w ang2", + "ji u4", + "sh i1", + "sh ou4", + "m ei2", + "k u", + "s u", + "f eng1", + "z e2", + "tu2 shi4", + "t i2", + "q i4", + "ji u3", + "sh en1", + "zh e3", + "ren2kou3 bian4hua4", + "ren2kou3bian4hua4 tu2shi4", + "di4 qu1", + "y ang2", + "m en", + "men 5", + "l ong2", + "bi ng4", + "ch 
an3", + "zh u1", + "w ei3", + "w ai4", + "xi ng1", + "bo 1", + "b i3", + "t ang2", + "hu a1", + "bo 2", + "shu i3", + "sh u1", + "d ou1", + "s ai4", + "ch ao2", + "b i4", + "li ng2", + "l ei4", + "da4 xue2", + "f en4", + "shu3 de5", + "m u3", + "ji ao1", + "d ang1", + "ch eng1", + "t ong3", + "n v3", + "q i3", + "y an3", + "mi an4", + "l uo4", + "ji ng4", + "g e1", + "r u4", + "d an4", + "ri4 ben3", + "p u3", + "yu n4", + "hu ang2", + "wo 3", + "l v", + "h ai2", + "shi4 yi1", + "xi e1", + "yi ng3", + "w u2", + "sh en2", + "w ang3", + "gu ang3", + "li u4", + "s u4", + "shi4 zhen4", + "c an1", + "c ao3", + "xi a2", + "k a3", + "d a2", + "h u4", + "b an4", + "d ang3", + "h u2", + "z ong3", + "de ng3", + "de5yi2ge4 shi4zhen4", + "ch uan2", + "mo 4", + "zh ang1", + "b an1", + "mo 2", + "ch a2", + "c e4", + "zhu3 yao4", + "t ou2", + "j u2", + "shi4 wei4yu2", + "s a4", + "u n1", + "ke3 yi3", + "d u1", + "h an4", + "li ang4", + "sh a1", + "ji a3", + "z i1", + "lv 4", + "f u1", + "xi an1", + "x u4", + "gu ang1", + "m eng2", + "b ao4", + "yo u4", + "r ong2", + "zhi1 yi1", + "w ei1", + "m ao2", + "guo2 jia1", + "c ong2", + "g ou4", + "ti e3", + "zh en1", + "d u2", + "bi an1", + "c i2", + "q u3", + "f an4", + "xi ang3", + "m en2", + "j u1", + "h ong2", + "z i3", + "ta1 men5", + "ji 3", + "z ong1", + "zhou1 de5yi2ge4shi4zhen4", + "t uan2", + "ji ng3", + "gong1 si1", + "xi e4", + "l i2", + "li4 shi3", + "b ao1", + "g ang3", + "gu i1", + "zh eng1", + "zhi2 wu4", + "ta1 de5", + "pi n3", + "zhu an1", + "ch ong2", + "shi3 yong4", + "w a3", + "sh uo1", + "chu an1", + "l ei2", + "w an1", + "h uo2", + "q u", + "s u1", + "z ao3", + "g ai3", + "q u4", + "g u4", + "l u", + "x i2", + "h ang2", + "yi ng4", + "c un1", + "g en1", + "yi ng2", + "ti ng2", + "cheng2 shi4", + "ji ang3", + "li ng3", + "l un2", + "bu4 fen4", + "de ng1", + "xu an3", + "dong4 wu4", + "de2 guo2", + "xi an3", + "f an3", + "zh e5", + "h an2", + "h ao4", + "m i4", + "r an2", + "qi n1", + "ti ao2", + "zh an3", + "h i", + 
"k a", + "n o", + "t e", + "s u", + "s hi", + "t a", + "t o", + "n a", + "w a", + "o u", + "r u", + "n i", + "k u", + "k i", + "g a", + "d e", + "k o", + "m a", + "r e", + "r a", + "m o", + "t su", + "w o", + "e n", + "r i", + "s a", + "d a", + "s e", + "j i", + "h a", + "c hi", + "k e", + "te ki", + "m i", + "y ou", + "s h", + "s o", + "y o", + "y a", + "na i", + "t te", + "a ru", + "b a", + "u u", + "t ta", + "ka i", + "ka n", + "shi te", + "m e", + "d o", + "mo no", + "se i", + "r o", + "ko to", + "ka ra", + "shi ta", + "b u", + "m u", + "c h", + "su ru", + "k ou", + "g o", + "ma su", + "ta i", + "f u", + "k en", + "i u", + "g en", + "wa re", + "shi n", + "z u", + "a i", + "o n", + "o ku", + "g i", + "d ou", + "n e", + "y uu", + "i ru", + "i te", + "ji ko", + "de su", + "j u", + "ra re", + "sh u", + "b e", + "sh ou", + "s ha", + "se kai", + "s ou", + "k you", + "ma shita", + "s en", + "na ra", + "sa n", + "ke i", + "i ta", + "a ri", + "i tsu", + "ko no", + "j ou", + "na ka", + "ch ou", + "so re", + "g u", + "na ru", + "ga ku", + "re ba", + "g e", + "h o", + "i n", + "hi to", + "sa i", + "na n", + "da i", + "tsu ku", + "shi ki", + "sa re", + "na ku", + "p p", + "bu n", + "ju n", + "so no", + "ka ku", + "z ai", + "b i", + "to u", + "wa ta", + "sh uu", + "i i", + "te i", + "ka re", + "y u", + "shi i", + "ma de", + "sh o", + "a n", + "ke reba", + "shi ka", + "i chi", + "ha n", + "de ki", + "ni n", + "ware ware", + "na kereba", + "o ite", + "h ou", + "ya ku", + "ra i", + "mu jun", + "l e", + "yo ku", + "bu tsu", + "o o", + "ko n", + "o mo", + "ga e", + "nara nai", + "ta chi", + "z en", + "ch uu", + "kan gae", + "ta ra", + "to ki", + "ko ro", + "mujun teki", + "z e", + "na ga", + "ji n", + "shi ma", + "te n", + "i ki", + "i ku", + "no u", + "i masu", + "r ou", + "h on", + "ka e", + "t to", + "ko re", + "ta n", + "ki ta", + "i s", + "da tta", + "ji tsu", + "ma e", + "i e", + "me i", + "da n", + "h e", + "to ku", + "dou itsu", + "ri tsu", + "k yuu", + "h you", + "rare 
ta", + "kei sei", + "k kan", + "rare ru", + "m ou", + "do ko", + "r you", + "da ke", + "naka tta", + "so ko", + "ta be", + "e r", + "ha na", + "c o", + "fu ku", + "p a", + "so n", + "ya su", + "ch o", + "wata ku", + "ya ma", + "z a", + "k yo", + "gen zai", + "b oku", + "a ta", + "j a", + "ka wa", + "ma sen", + "j uu", + "ro n", + "b o", + "na tte", + "wataku shi", + "yo tte", + "ma i", + "g ou", + "ha i", + "mo n", + "ba n", + "ji shin", + "c a", + "re te", + "n en", + "o ka", + "ka gaku", + "na tta", + "p o", + "ka ru", + "na ri", + "m en", + "ma ta", + "e i", + "ku ru", + "ga i", + "ka ri", + "sha kai", + "kou i", + "yo ri", + "se tsu", + "j o", + "re ru", + "to koro", + "ju tsu", + "i on", + "sa ku", + "tta i", + "c ha", + "nin gen", + "n u", + "c e", + "ta me", + "kan kyou", + "de n", + "o oku", + "i ma", + "wata shi", + "tsuku ru", + "su gi", + "b en", + "ji bun", + "shi tsu", + "ke ru", + "ki n", + "ki shi", + "shika shi", + "mo to", + "ma ri", + "i tte", + "de shita", + "n de", + "ari masu", + "te r", + "z ou", + "ko e", + "ze ttai", + "kkan teki", + "h en", + "re kishi", + "deki ru", + "tsu ka", + "l a", + "i tta", + "o i", + "ko butsu", + "mi ru", + "sh oku", + "shi masu", + "gi jutsu", + "g you", + "jou shiki", + "a tta", + "ho do", + "ko ko", + "tsuku rareta", + "z oku", + "hi tei", + "ko ku", + "rekishi teki", + "ke te", + "o ri", + "i mi", + "ka ko", + "naga ra", + "ka karu", + "shu tai", + "ha ji", + "ma n", + "ta ku", + "ra n", + "douitsu teki", + "z o", + "me te", + "re i", + "tsu u", + "sare te", + "gen jitsu", + "p e", + "s t", + "ba i", + "na wa", + "ji kan", + "wa ru", + "r t", + "a tsu", + "so ku", + "koui teki", + "a ra", + "u ma", + "a no", + "i de", + "ka ta", + "te tsu", + "ga wa", + "ke do", + "re ta", + "mi n", + "sa you", + "tte ru", + "to ri", + "p u", + "ki mi", + "b ou", + "mu ra", + "sare ru", + "ma chi", + "k ya", + "o sa", + "kon na", + "a ku", + "a l", + "sare ta", + "i pp", + "shi ku", + "u chi", + "hito tsu", + "ha tara", + 
"tachi ba", + "shi ro", + "ka tachi", + "to mo", + "e te", + "me ru", + "ni chi", + "da re", + "ka tta", + "e ru", + "su ki", + "a ge", + "oo ki", + "ma ru", + "mo ku", + "o ko", + "kangae rareru", + "o to", + "tan ni", + "ta da", + "tai teki", + "mo tte", + "ki nou", + "shi nai", + "k ki", + "u e", + "ta ri", + "l i", + "ra nai", + "k kou", + "mi rai", + "pp on", + "go to", + "hi n", + "hi tsu", + "te ru", + "mo chi", + "ka tsu", + "re n", + "n yuu", + "su i", + "zu ka", + "tsu ite", + "no mi", + "su gu", + "ku da", + "tetsu gaku", + "i ka", + "ron ri", + "o ki", + "ni ppon", + "p er", + "shi mashita", + "chi shiki", + "cho kkanteki", + "su ko", + "t ion", + "ku u", + "a na", + "a rou", + "ka tte", + "ku ri", + "i nai", + "hyou gen", + "i shiki", + "do ku", + "a tte", + "a tara", + "to n", + "wa ri", + "ka o", + "sei san", + "hana shi", + "s i", + "ka ke", + "na ji", + "su nawa", + "sunawa chi", + "u go", + "su u", + "ba ra", + "le v", + "hi ro", + "i wa", + "be tsu", + "yo i", + "se ru", + "shite ru", + "rare te", + "to shi", + "se ki", + "tai ritsu", + "wa kara", + "to kyo", + "k ka", + "k yoku", + "u n", + "i ro", + "mi te", + "sa ki", + "kan ji", + "mi ta", + "su be", + "r yoku", + "ma tta", + "kuda sai", + "omo i", + "ta no", + "ware ru", + "co m", + "hitsu you", + "ka shi", + "re nai", + "kan kei", + "a to", + "ga tte", + "o chi", + "mo tsu", + "in g", + "son zai", + "l l", + "o re", + "tai shite", + "a me", + "sei mei", + "ka no", + "gi ri", + "kangae ru", + "yu e", + "a sa", + "o naji", + "yo ru", + "ni ku", + "osa ka", + "suko shi", + "c k", + "ta ma", + "kano jo", + "ki te", + "mon dai", + "a mari", + "e ki", + "ko jin", + "ha ya", + "i t", + "de te", + "atara shii", + "a wa", + "ga kkou", + "tsu zu", + "shu kan", + "i mashita", + "mi na", + "ata e", + "da rou", + "hatara ku", + "ga ta", + "da chi", + "ma tsu", + "ari masen", + "sei butsu", + "mi tsu", + "he ya", + "yasu i", + "d i", + "de ni", + "no ko", + "ha ha", + "do mo", + "ka mi", + "su deni", + 
"na o", + "ra ku", + "i ke", + "a ki", + "me ta", + "l o", + "ko domo", + "so shite", + "ga me", + "ba kari", + "to te", + "ha tsu", + "mi se", + "moku teki", + "da kara", + "s z", + "e l", + "g y", + "e n", + "t t", + "e m", + "a n", + "a k", + "e r", + "a z", + "a l", + "e t", + "o l", + "e g", + "e k", + "m i", + "o n", + "é s", + "c s", + "a t", + "á r", + "h o", + "e z", + "á l", + "i s", + "á n", + "o r", + "a r", + "e gy", + "e s", + "é r", + "á t", + "o tt", + "e tt", + "m eg", + "t a", + "o k", + "o s", + "ho gy", + "n em", + "é g", + "n y", + "k i", + "é l", + "h a", + "á s", + "ü l", + "i n", + "mi n", + "n a", + "e d", + "o m", + "i k", + "k ö", + "m a", + "n i", + "v a", + "v ol", + "é t", + "b b", + "f el", + "i g", + "l e", + "r a", + "é n", + "t e", + "d e", + "a d", + "ó l", + "b e", + "on d", + "j a", + "r e", + "u l", + "b en", + "n ek", + "u t", + "vol t", + "b an", + "ö r", + "o g", + "a p", + "o d", + "á g", + "n k", + "é k", + "v al", + "k or", + "a m", + "i l", + "í t", + "á k", + "b a", + "u d", + "sz er", + "min d", + "o z", + "é p", + "el l", + "ér t", + "m ond", + "i t", + "sz t", + "n ak", + "a mi", + "n e", + "ő l", + "cs ak", + "n é", + "ma g", + "ol y", + "m er", + "ál l", + "án y", + "ö n", + "ö l", + "min t", + "m ár", + "ö tt", + "na gy", + "é sz", + "az t", + "el ő", + "t ud", + "o t", + "é ny", + "á z", + "m ég", + "kö z", + "el y", + "s ég", + "en t", + "s em", + "ta m", + "h et", + "h al", + "f i", + "a s", + "v an", + "ho z", + "v e", + "u k", + "k ez", + "á m", + "v el", + "b er", + "a j", + "u nk", + "i z", + "va gy", + "m os", + "sz em", + "em ber", + "f og", + "mer t", + "ü k", + "l en", + "ö s", + "e j", + "t al", + "h at", + "t ak", + "h i", + "m ás", + "s ág", + "ett e", + "l eg", + "ü nk", + "h át", + "sz a", + "on y", + "ez t", + "mind en", + "en d", + "ül t", + "h an", + "j ó", + "k is", + "á j", + "in t", + "ú gy", + "i d", + "mos t", + "ar t", + "í r", + "k er", + "i tt", + "a tt", + "el t", + "mond ta", + "k 
ell", + "l á", + "ak i", + "ál t", + "ér d", + "t ö", + "l an", + "v ár", + "h ol", + "t el", + "l át", + "ő k", + "v et", + "s e", + "ut án", + "k ét", + "na p", + "í v", + "ál y", + "v ég", + "ö k", + "i r", + "d ul", + "v is", + "né z", + "t er", + "á ban", + "k ül", + "ak kor", + "k ap", + "sz él", + "y en", + "ú j", + "i m", + "oly an", + "es en", + "k ed", + "h ely", + "t ör", + "b ól", + "el m", + "r á", + "ár a", + "r ó", + "l ó", + "vol na", + "t an", + "le het", + "e bb", + "t en", + "t ek", + "s ok", + "k al", + "f or", + "u g", + "ol t", + "k a", + "ek et", + "b or", + "f ej", + "g ond", + "a g", + "ak ar", + "f él", + "ú l", + "b el", + "ott a", + "mi t", + "val ami", + "j el", + "é d", + "ar c", + "u r", + "hal l", + "t i", + "f öl", + "á ba", + "ol g", + "ki r", + "ol d", + "m ar", + "k érd", + "j ár", + "ú r", + "sz e", + "z s", + "él et", + "j át", + "o v", + "u s", + "é z", + "v il", + "v er", + "ő r", + "á d", + "ö g", + "le sz", + "on t", + "b iz", + "k oz", + "á bb", + "kir ály", + "es t", + "a b", + "en g", + "ig az", + "b ar", + "ha j", + "d i", + "o b", + "k od", + "r ól", + "v ez", + "tö bb", + "sz ó", + "é ben", + "ö t", + "ny i", + "t á", + "sz ól", + "gond ol", + "eg ész", + "í gy", + "ő s", + "o bb", + "os an", + "b ől", + "a bb", + "c i", + "ő t", + "n ál", + "k ép", + "azt án", + "v i", + "t art", + "be szél", + "m en", + "elő tt", + "a szt", + "ma j", + "kö r", + "han g", + "í z", + "in cs", + "a i", + "é v", + "ó d", + "ó k", + "hoz z", + "t em", + "ok at", + "an y", + "nagy on", + "h áz", + "p er", + "p ed", + "ez te", + "et len", + "nek i", + "maj d", + "sz ony", + "án ak", + "fel é", + "egy szer", + "j e", + "ad t", + "gy er", + "ami kor", + "f oly", + "sz ak", + "ő d", + "h ú", + "á sz", + "am ely", + "h ar", + "ér e", + "il yen", + "od a", + "j ák", + "t ár", + "á val", + "l ak", + "t ó", + "m ent", + "gy an", + "él y", + "ú t", + "v ar", + "kez d", + "m ell", + "mi kor", + "h ez", + "val ó", + "k o", + "m es", + "szer et", + 
"r end", + "l et", + "vis sza", + "ig en", + "f ő", + "va s", + "as szony", + "r ől", + "ped ig", + "p i", + "sz ép", + "t ák", + "ö v", + "an i", + "vil ág", + "p en", + "mag a", + "t et", + "sz ik", + "é j", + "én t", + "j ött", + "s an", + "sz í", + "i de", + "g at", + "ett em", + "ul t", + "h ány", + "ás t", + "a hol", + "ők et", + "h ár", + "k el", + "n ő", + "cs i", + "tal ál", + "el te", + "lá tt", + "tör t", + "ha gy", + "e sz", + "s en", + "n él", + "p ar", + "v ál", + "k ut", + "l ány", + "ami t", + "s ő", + "ell en", + "mag át", + "in k", + "u gyan", + "kül ön", + "a sz", + "mind ig", + "l ép", + "tal án", + "u n", + "sz or", + "k e", + "il lan", + "n incs", + "z et", + "vagy ok", + "tel en", + "is mer", + "s or", + "is ten", + "ít ott", + "j obb", + "v es", + "dul t", + "j uk", + "sz en", + "r o", + "ö m", + "l ett", + "k ar", + "egy ik", + "b ár", + "sz i", + "sz ív", + "az on", + "e szt", + "föl d", + "kut y", + "p illan", + "f ér", + "k om", + "t ől", + "t ű", + "é be", + "t ött", + "bar át", + "í g", + "a hogy", + "e h", + "e p", + "s o", + "v en", + "jel ent", + "t at", + "sz eg", + "mint ha", + "f al", + "egy en", + "mi l", + "sza b", + "r i", + "é m", + "biz ony", + "j on", + "ör eg", + "d olg", + "cs ap", + "ti szt", + "áll t", + "an cs", + "id ő", + "k at", + "ü gy", + "mi ért", + "ó t", + "ü r", + "cs in", + "h az", + "b et", + "én ek", + "v ér", + "j ól", + "al att", + "m ely", + "l o", + "sem mi", + "ny ug", + "v ág", + "kö vet", + "ös sze", + "ma d", + "l i", + "a cs", + "fi ú", + "kö n", + "más ik", + "j ön", + "sz ám", + "g er", + "s ó", + "r ész", + "k ér", + "z el", + "é vel", + "e o", + "e u", + "a n", + "eu l", + "eu n", + "eo n", + "a e", + "d a", + "a l", + "s s", + "i n", + "i l", + "a g", + "an g", + "y eon", + "y eo", + "d o", + "c h", + "n g", + "j i", + "h an", + "g a", + "g o", + "u i", + "h ae", + "a m", + "u l", + "u n", + "g eo", + "s i", + "n eun", + "ss da", + "s eo", + "eon g", + "y o", + "i da", + "t t", + "k k", + "j 
eo", + "d eul", + "w a", + "eu m", + "g e", + "o n", + "o g", + "s al", + "m an", + "yeon g", + "geo s", + "h ag", + "an eun", + "j a", + "g i", + "s u", + "i ss", + "o l", + "d ae", + "eo b", + "h a", + "j u", + "eo l", + "g eu", + "j eong", + "s ae", + "do e", + "g eul", + "s eu", + "s in", + "eul o", + "b n", + "s ang", + "bn ida", + "h al", + "b o", + "han eun", + "m al", + "i m", + "m o", + "b u", + "jeo g", + "sae ng", + "in eun", + "an h", + "m a", + "sal am", + "j o", + "s a", + "eo m", + "n ae", + "w i", + "l o", + "g wa", + "yeo l", + "n a", + "e seo", + "y e", + "m yeon", + "tt ae", + "h w", + "j e", + "eob s", + "j ang", + "g u", + "g w", + "il eul", + "yeo g", + "j eon", + "si g", + "j ag", + "j in", + "y u", + "o e", + "s e", + "hag o", + "d eun", + "y a", + "m un", + "s eong", + "g ag", + "h am", + "d ang", + "b a", + "l eul", + "s il", + "do ng", + "kk a", + "b al", + "da l", + "han da", + "eo ssda", + "ae g", + "l i", + "ha ji", + "s eon", + "o ng", + "hae ssda", + "d e", + "i ssda", + "e ge", + "b un", + "m ul", + "ju ng", + "ji g", + "m u", + "iss neun", + "b i", + "g eun", + "seu bnida", + "w on", + "p p", + "d aneun", + "eo h", + "d eo", + "ga m", + "j al", + "hae ng", + "ag o", + "y ang", + "b ul", + "b ang", + "u m", + "s o", + "h i", + "j ae", + "si m", + "saeng gag", + "hag e", + "s og", + "eo ss", + "d an", + "ja sin", + "j il", + "eo g", + "g yeong", + "doe n", + "go ng", + "m i", + "ch i", + "d eu", + "d eon", + "hae ss", + "d u", + "n am", + "eun g", + "jo h", + "n al", + "m yeong", + "w o", + "eon a", + "i go", + "g yeol", + "y ag", + "gw an", + "ul i", + "yo ng", + "n o", + "l yeo", + "j og", + "eoh ge", + "ga t", + "b og", + "mo s", + "t ong", + "ch a", + "man h", + "jeo l", + "geo l", + "h oe", + "ag a", + "n aneun", + "g an", + "un eun", + "ch eol", + "ch e", + "do l", + "b on", + "b an", + "ba d", + "ch u", + "ham yeon", + "yeo ssda", + "i bnida", + "g ye", + "eo s", + "hw al", + "salam deul", + "ji man", + "dang sin", + "ji b", + 
"ttae mun", + "m ae", + "i b", + "e neun", + "eu g", + "jeo m", + "geul eon", + "h wa", + "a ssda", + "b eob", + "bu t", + "b ae", + "yeo ss", + "ch in", + "ch aeg", + "g eon", + "g ae", + "nae ga", + "i ga", + "m og", + "sig an", + "g il", + "h yeon", + "l yeog", + "gu g", + "p yeon", + "s an", + "w ae", + "j ul", + "s eul", + "deun g", + "haji man", + "eum yeon", + "p il", + "m ol", + "n eu", + "a ss", + "n yeon", + "t ae", + "h u", + "p yo", + "s ul", + "g ang", + "j ineun", + "b eon", + "ha da", + "seo l", + "si p", + "dal eun", + "a p", + "sal m", + "g yo", + "ch eon", + "hag i", + "in a", + "cheol eom", + "g al", + "il a", + "kka ji", + "anh neun", + "ha bnida", + "tt eon", + "n u", + "hae seo", + "doen da", + "s ol", + "tt al", + "l a", + "il o", + "seu b", + "b yeon", + "m yeo", + "b eol", + "s on", + "n un", + "j un", + "j am", + "j eung", + "tt o", + "e n", + "mo m", + "h o", + "ch im", + "hw ang", + "eun eun", + "jo ng", + "bo da", + "n ol", + "n eom", + "but eo", + "jig eum", + "eobs da", + "dae lo", + "i g", + "y ul", + "p yeong", + "seon eun", + "sal ang", + "seu t", + "h im", + "n an", + "h eom", + "h yang", + "p i", + "gw ang", + "eobs neun", + "hw ag", + "ge ss", + "jag i", + "il eon", + "wi hae", + "dae han", + "ga ji", + "m eog", + "j yeo", + "cha j", + "b yeong", + "eo d", + "g yeo", + "do n", + "eo ji", + "g ul", + "mo deun", + "j on", + "in saeng", + "geul ae", + "h ang", + "sa sil", + "si b", + "ch al", + "il ago", + "doe l", + "g eum", + "doe neun", + "b ol", + "ga jang", + "geul igo", + "e l", + "h yeong", + "haeng bog", + "ch ul", + "h on", + "ch ae", + "s am", + "m ang", + "in da", + "da m", + "w ol", + "ch oe", + "d ul", + "si jag", + "ch eong", + "il aneun", + "ul ineun", + "ae n", + "kk e", + "mun je", + "a do", + "t eu", + "g un", + "geun eun", + "b ge", + "ch eo", + "b aeg", + "ju g", + "t a", + "sang dae", + "geu geos", + "do g", + "eu s", + "deu s", + "ja b", + "h yeo", + "tt eohge", + "u g", + "ma j", + "ch il", + "s wi", + "j 
ileul", + "ch ang", + "g aneun", + "m ag", + "i ji", + "da go", + "m in", + "yo han", + "t eug", + "pp un", + "al eul", + "haeng dong", + "p o", + "m il", + "ch am", + "se sang", + "e do", + "p an", + "man deul", + "am yeon", + "a b", + "kk ae", + "b ag", + "i deul", + "p um", + "m eol", + "s un", + "n eul", + "ham kke", + "chu ng", + "da b", + "yu g", + "s ag", + "gwang ye", + "il eohge", + "bal o", + "neun de", + "ham yeo", + "go s", + "geul eoh", + "an ila", + "bang beob", + "da si", + "b yeol", + "g yeon", + "gam jeong", + "on eul", + "j aneun", + "yeo m", + "l ago", + "i gi", + "hw an", + "t eul", + "eo seo", + "si k", + "ch o", + "jag a", + "geul eom", + "geul eona", + "jeong do", + "g yeog", + "geul eohge", + "geu deul", + "eu t", + "im yeon", + "j jae", + "k eun", + "i sang", + "mal haessda", + "eu ge", + "no p", + "in gan", + "bo myeon", + "t aeg", + "seu s", + "d wi", + "s aneun", + "w an", + "anh go", + "t an", + "nu gu", + "su ng", + "da myeon", + "a deul", + "p eul", + "ttal a", + "d i", + "geos do", + "a ji", + "m eon", + "eum yeo", + "dol og", + "neun g", + "mo du", + "क े", + "ह ै", + "े ं", + "् र", + "ा र", + "न े", + "य ा", + "म ें", + "स े", + "क ी", + "क ा", + "ो ं", + "त ा", + "क र", + "स ्", + "क ि", + "क ो", + "र ्", + "न ा", + "क ्", + "ह ी", + "औ र", + "प र", + "त े", + "ह ो", + "प ्र", + "ा न", + "् य", + "ल ा", + "व ा", + "ल े", + "स ा", + "है ं", + "ल ि", + "ज ा", + "ह ा", + "भ ी", + "व ि", + "इ स", + "त ी", + "न ्", + "र ा", + "म ा", + "द े", + "द ि", + "ब ा", + "त ि", + "थ ा", + "न ि", + "क ार", + "ए क", + "ही ं", + "ह ु", + "ं ग", + "ै ं", + "न ी", + "स ी", + "अ प", + "त ्", + "न हीं", + "र ी", + "म े", + "म ु", + "ि त", + "त ो", + "प ा", + "ल ी", + "लि ए", + "ग ा", + "ल ्", + "र ह", + "र े", + "क् ष", + "म ैं", + "स म", + "उ स", + "ज ि", + "त ्र", + "म ि", + "च ा", + "ो ग", + "स ं", + "द ्", + "स ि", + "आ प", + "त ु", + "द ा", + "क ु", + "य ों", + "व े", + "ज ी", + "् या", + "उ न", + "ि क", + "य े", + "भ ा", + "् ट", + "ह म", + "स् 
ट", + "श ा", + "ड ़", + "ं द", + "ख ा", + "म ्", + "श ्", + "य ह", + "स क", + "प ू", + "कि या", + "अप ने", + "र ू", + "स ु", + "म ी", + "ह ि", + "ज ो", + "थ े", + "र ि", + "द ी", + "थ ी", + "ग ी", + "ल ोग", + "ग या", + "त र", + "न् ह", + "च ्", + "व ार", + "ब ी", + "प ्", + "द ो", + "ट ी", + "श ि", + "कर ने", + "ग े", + "ै से", + "इ न", + "ं ड", + "सा थ", + "प ु", + "ब े", + "ब ार", + "व ी", + "अ न", + "ह र", + "उ न्ह", + "हो ता", + "ज ब", + "कु छ", + "म ान", + "क ्र", + "ब ि", + "प ह", + "फ ि", + "स र", + "ार ी", + "र ो", + "द ू", + "क हा", + "त क", + "श न", + "ब ्", + "स् थ", + "व ह", + "बा द", + "ओ ं", + "ग ु", + "ज ्", + "्र े", + "ग र", + "रह े", + "व र्", + "ह ू", + "ार ्", + "प ी", + "ब हु", + "मु झ", + "्र ा", + "दि या", + "स ब", + "कर ते", + "अप नी", + "बहु त", + "क ह", + "ट े", + "हु ए", + "कि सी", + "र हा", + "ष ्ट", + "ज ़", + "ब ना", + "स ो", + "ड ि", + "को ई", + "व ्य", + "बा त", + "र ु", + "व ो", + "मुझ े", + "द् ध", + "च ार", + "मे रे", + "व र", + "्र ी", + "जा ता", + "न ों", + "प्र ा", + "दे ख", + "ट ा", + "क् या", + "अ ध", + "ल ग", + "ल ो", + "प ि", + "य ु", + "च े", + "जि स", + "ं त", + "ान ी", + "प ै", + "ज न", + "ार े", + "च ी", + "मि ल", + "द ु", + "दे श", + "च् छ", + "ष ्", + "स ू", + "ख े", + "च ु", + "ि या", + "ल गा", + "ब ु", + "उन के", + "ज् ञ", + "क्ष ा", + "त रह", + "्या दा", + "वा ले", + "पू र्", + "मैं ने", + "का म", + "रू प", + "हो ती", + "उ प", + "ज ान", + "प्र कार", + "भ ार", + "म न", + "हु आ", + "ट र", + "हू ँ", + "पर ि", + "पा स", + "अन ु", + "रा ज", + "लोग ों", + "अ ब", + "सम झ", + "ड ी", + "म ौ", + "श ु", + "च ि", + "प े", + "क ृ", + "सक ते", + "म ह", + "य ोग", + "द र्", + "उ से", + "ं ध", + "ड ा", + "जा ए", + "ब ो", + "ू ल", + "म ो", + "ों ने", + "ं स", + "तु म", + "पह ले", + "ब ता", + "त था", + "य ो", + "ग ई", + "उ त्", + "सक ता", + "क म", + "ज ्यादा", + "र ख", + "सम य", + "ार ा", + "अ गर", + "स् त", + "च ल", + "फि र", + "वार ा", + "कर ना", + "श ी", + "ग ए", + "ब न", + "ौ र", + "हो ने", + "चा ह", + "ख ु", + "हा ँ", + "उन्ह 
ें", + "उन्ह ोंने", + "छ ो", + "म् ह", + "प्र ति", + "नि क", + "व न", + "्य ू", + "र ही", + "तु म्ह", + "ज ैसे", + "ि यों", + "क् यों", + "ल ों", + "फ ़", + "ं त्र", + "हो ते", + "क् ति", + "त ्य", + "कर ्", + "क ई", + "व ं", + "कि न", + "प ो", + "कार ण", + "ड़ ी", + "भ ि", + "इस के", + "ब र", + "उस के", + "द् वारा", + "श े", + "क ॉ", + "दि न", + "न् न", + "ड़ ा", + "स् व", + "नि र्", + "मु ख", + "लि या", + "ट ि", + "ज्ञ ान", + "क् त", + "द ्र", + "ग ्", + "क् स", + "म ै", + "ग ो", + "ज े", + "ट ्र", + "म ार", + "त् व", + "ध ार", + "भा व", + "कर ता", + "ख ि", + "क ं", + "चा हि", + "य र", + "प् त", + "क ों", + "ं च", + "ज ु", + "म त", + "अ च्छ", + "हु ई", + "क भी", + "ले किन", + "भ ू", + "अप ना", + "दू स", + "चाहि ए", + "य ू", + "घ र", + "सब से", + "मे री", + "ना म", + "ढ ़", + "ं ट", + "ें गे", + "ब ै", + "फ ा", + "ए वं", + "य ी", + "ग ्र", + "क्ष े", + "आ ज", + "आप को", + "भा ग", + "ठ ा", + "क ै", + "भार त", + "उन की", + "प हु", + "स भी", + "ध ा", + "ण ा", + "स ान", + "हो गा", + "त ब", + "स ंग", + "प र्", + "अ व", + "त ना", + "ग ि", + "य न", + "स् था", + "च ित", + "ट ्", + "छ ा", + "जा ने", + "क्षे त्र", + "वा ली", + "पूर् ण", + "स मा", + "कार ी" + ] + } +} \ No newline at end of file diff --git a/models/lyrics_utils/zh_num2words.py b/models/lyrics_utils/zh_num2words.py new file mode 100644 index 0000000..2c6a941 --- /dev/null +++ b/models/lyrics_utils/zh_num2words.py @@ -0,0 +1,1209 @@ +# Authors: +# 2019.5 Zhiyang Zhou (https://github.com/Joee1995/chn_text_norm.git) +# 2019.9 - 2022 Jiayu DU +#copy from https://github.com/coqui-ai/TTS/blob/dbf1a08a0d4e47fdad6172e433eeb34bc6b13b4e/TTS/tts/layers/xtts/zh_num2words.py +import argparse +import csv +import os +import re +import string +import sys + +# fmt: off + +# ================================================================================ # +# basic constant +# ================================================================================ # +CHINESE_DIGIS = "零一二三四五六七八九" +BIG_CHINESE_DIGIS_SIMPLIFIED = 
"零壹贰叁肆伍陆柒捌玖" +BIG_CHINESE_DIGIS_TRADITIONAL = "零壹貳參肆伍陸柒捌玖" +SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = "十百千万" +SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = "拾佰仟萬" +LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = "亿兆京垓秭穰沟涧正载" +LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = "億兆京垓秭穰溝澗正載" +SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = "十百千万" +SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = "拾佰仟萬" + +ZERO_ALT = "〇" +ONE_ALT = "幺" +TWO_ALTS = ["两", "兩"] + +POSITIVE = ["正", "正"] +NEGATIVE = ["负", "負"] +POINT = ["点", "點"] +# PLUS = [u'加', u'加'] +# SIL = [u'杠', u'槓'] + +FILLER_CHARS = ["呃", "啊"] + +ER_WHITELIST = ( + "(儿女|儿子|儿孙|女儿|儿媳|妻儿|" + "胎儿|婴儿|新生儿|婴幼儿|幼儿|少儿|小儿|儿歌|儿童|儿科|托儿所|孤儿|" + "儿戏|儿化|台儿庄|鹿儿岛|正儿八经|吊儿郎当|生儿育女|托儿带女|养儿防老|痴儿呆女|" + "佳儿佳妇|儿怜兽扰|儿无常父|儿不嫌母丑|儿行千里母担忧|儿大不由爷|苏乞儿)" +) +ER_WHITELIST_PATTERN = re.compile(ER_WHITELIST) + +# 中文数字系统类型 +NUMBERING_TYPES = ["low", "mid", "high"] + +CURRENCY_NAMES = "(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|" "里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)" +CURRENCY_UNITS = "((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)" +COM_QUANTIFIERS = ( + "(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|" + "砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|" + "针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|" + "毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|" + "盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|" + "纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块)" +) + + +# Punctuation information are based on Zhon project (https://github.com/tsroten/zhon.git) +CN_PUNCS_STOP = "!?。。" +CN_PUNCS_NONSTOP = ""#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏·〈〉-" +CN_PUNCS = CN_PUNCS_STOP + CN_PUNCS_NONSTOP + +PUNCS = CN_PUNCS + string.punctuation +PUNCS_TRANSFORM = str.maketrans(PUNCS, "," * len(PUNCS), "") # replace puncs with English comma + + +# https://zh.wikipedia.org/wiki/全行和半行 +QJ2BJ = { + " ": " ", + "!": "!", + """: '"', + "#": "#", + "$": "$", + "%": "%", + 
"&": "&", + "'": "'", + "(": "(", + ")": ")", + "*": "*", + "+": "+", + ",": ",", + "-": "-", + ".": ".", + "/": "/", + "0": "0", + "1": "1", + "2": "2", + "3": "3", + "4": "4", + "5": "5", + "6": "6", + "7": "7", + "8": "8", + "9": "9", + ":": ":", + ";": ";", + "<": "<", + "=": "=", + ">": ">", + "?": "?", + "@": "@", + "A": "A", + "B": "B", + "C": "C", + "D": "D", + "E": "E", + "F": "F", + "G": "G", + "H": "H", + "I": "I", + "J": "J", + "K": "K", + "L": "L", + "M": "M", + "N": "N", + "O": "O", + "P": "P", + "Q": "Q", + "R": "R", + "S": "S", + "T": "T", + "U": "U", + "V": "V", + "W": "W", + "X": "X", + "Y": "Y", + "Z": "Z", + "[": "[", + "\": "\\", + "]": "]", + "^": "^", + "_": "_", + "`": "`", + "a": "a", + "b": "b", + "c": "c", + "d": "d", + "e": "e", + "f": "f", + "g": "g", + "h": "h", + "i": "i", + "j": "j", + "k": "k", + "l": "l", + "m": "m", + "n": "n", + "o": "o", + "p": "p", + "q": "q", + "r": "r", + "s": "s", + "t": "t", + "u": "u", + "v": "v", + "w": "w", + "x": "x", + "y": "y", + "z": "z", + "{": "{", + "|": "|", + "}": "}", + "~": "~", +} +QJ2BJ_TRANSFORM = str.maketrans("".join(QJ2BJ.keys()), "".join(QJ2BJ.values()), "") + + +# 2013 China National Standard: https://zh.wikipedia.org/wiki/通用规范汉字表, raw resources: +# https://github.com/mozillazg/pinyin-data/blob/master/kMandarin_8105.txt with 8105 chinese chars in total +CN_CHARS_COMMON = ( + "一丁七万丈三上下不与丏丐丑专且丕世丘丙业丛东丝丞丢两严丧个丫中丰串临丸丹为主丽举" + "乂乃久么义之乌乍乎乏乐乒乓乔乖乘乙乜九乞也习乡书乩买乱乳乸乾了予争事二亍于亏云互" + "亓五井亘亚些亟亡亢交亥亦产亨亩享京亭亮亲亳亵亶亸亹人亿什仁仂仃仄仅仆仇仉今介仍从" + "仑仓仔仕他仗付仙仝仞仟仡代令以仨仪仫们仰仲仳仵件价任份仿企伈伉伊伋伍伎伏伐休众优" + "伙会伛伞伟传伢伣伤伥伦伧伪伫伭伯估伲伴伶伸伺似伽伾佁佃但位低住佐佑体何佖佗佘余佚" + "佛作佝佞佟你佣佤佥佩佬佯佰佳佴佶佸佺佻佼佽佾使侁侂侃侄侈侉例侍侏侑侔侗侘供依侠侣" + "侥侦侧侨侩侪侬侮侯侴侵侹便促俄俅俊俍俎俏俐俑俗俘俙俚俜保俞俟信俣俦俨俩俪俫俭修俯" + "俱俳俵俶俸俺俾倌倍倏倒倓倔倕倘候倚倜倞借倡倥倦倧倨倩倪倬倭倮倴债倻值倾偁偃假偈偌" + "偎偏偓偕做停偡健偬偭偰偲偶偷偻偾偿傀傃傅傈傉傍傒傕傣傥傧储傩催傲傺傻僇僎像僔僖僚" + "僦僧僬僭僮僰僳僵僻儆儇儋儒儡儦儳儴儿兀允元兄充兆先光克免兑兔兕兖党兜兢入全八公六" + "兮兰共关兴兵其具典兹养兼兽冀冁内冈冉册再冏冒冔冕冗写军农冠冢冤冥冬冮冯冰冱冲决况" + "冶冷冻冼冽净凄准凇凉凋凌减凑凓凘凛凝几凡凤凫凭凯凰凳凶凸凹出击凼函凿刀刁刃分切刈" + "刊刍刎刑划刖列刘则刚创初删判刨利别刬刭刮到刳制刷券刹刺刻刽刿剀剁剂剃剅削剋剌前剐" + 
"剑剔剕剖剜剞剟剡剥剧剩剪副割剽剿劁劂劄劈劐劓力劝办功加务劢劣动助努劫劬劭励劲劳劼" + "劾势勃勇勉勋勍勐勒勔勖勘勚募勠勤勰勺勾勿匀包匆匈匍匏匐匕化北匙匜匝匠匡匣匦匪匮匹" + "区医匼匾匿十千卅升午卉半华协卑卒卓单卖南博卜卞卟占卡卢卣卤卦卧卫卬卮卯印危即却卵" + "卷卸卺卿厂厄厅历厉压厌厍厕厖厘厚厝原厢厣厥厦厨厩厮去厾县叁参叆叇又叉及友双反发叔" + "叕取受变叙叚叛叟叠口古句另叨叩只叫召叭叮可台叱史右叵叶号司叹叻叼叽吁吃各吆合吉吊" + "同名后吏吐向吒吓吕吖吗君吝吞吟吠吡吣否吧吨吩含听吭吮启吱吲吴吵吸吹吻吼吽吾呀呃呆" + "呇呈告呋呐呒呓呔呕呖呗员呙呛呜呢呣呤呦周呱呲味呵呶呷呸呻呼命咀咂咄咆咇咉咋和咍咎" + "咏咐咒咔咕咖咙咚咛咝咡咣咤咥咦咧咨咩咪咫咬咯咱咳咴咸咺咻咽咿哀品哂哃哄哆哇哈哉哌" + "响哎哏哐哑哒哓哔哕哗哙哚哝哞哟哢哥哦哧哨哩哪哭哮哱哲哳哺哼哽哿唁唆唇唉唏唐唑唔唛" + "唝唠唢唣唤唧唪唬售唯唰唱唳唵唷唼唾唿啁啃啄商啉啊啐啕啖啜啡啤啥啦啧啪啫啬啭啮啰啴" + "啵啶啷啸啻啼啾喀喁喂喃善喆喇喈喉喊喋喏喑喔喘喙喜喝喟喤喧喱喳喵喷喹喻喽喾嗄嗅嗉嗌" + "嗍嗐嗑嗒嗓嗔嗖嗜嗝嗞嗟嗡嗣嗤嗥嗦嗨嗪嗫嗬嗯嗲嗳嗵嗷嗽嗾嘀嘁嘈嘉嘌嘎嘏嘘嘚嘛嘞嘟嘡" + "嘣嘤嘧嘬嘭嘱嘲嘴嘶嘹嘻嘿噀噂噇噌噍噎噔噗噘噙噜噢噤器噩噪噫噬噱噶噻噼嚄嚅嚆嚎嚏嚓" + "嚚嚣嚭嚯嚷嚼囊囔囚四回囟因囡团囤囫园困囱围囵囷囹固国图囿圃圄圆圈圉圊圌圐圙圜土圢" + "圣在圩圪圫圬圭圮圯地圲圳圹场圻圾址坂均坉坊坋坌坍坎坏坐坑坒块坚坛坜坝坞坟坠坡坤坥" + "坦坨坩坪坫坬坭坯坰坳坷坻坼坽垂垃垄垆垈型垌垍垎垏垒垓垕垙垚垛垞垟垠垡垢垣垤垦垧垩" + "垫垭垮垯垱垲垴垵垸垺垾垿埂埃埆埇埋埌城埏埒埔埕埗埘埙埚埝域埠埤埪埫埭埯埴埵埸培基" + "埼埽堂堃堆堇堉堋堌堍堎堐堑堕堙堞堠堡堤堧堨堪堰堲堵堼堽堾塄塅塆塌塍塑塔塘塝塞塥填" + "塬塱塾墀墁境墅墈墉墐墒墓墕墘墙墚增墟墡墣墦墨墩墼壁壅壑壕壤士壬壮声壳壶壸壹处备复" + "夏夐夔夕外夙多夜够夤夥大天太夫夬夭央夯失头夷夸夹夺夼奁奂奄奇奈奉奋奎奏契奓奔奕奖" + "套奘奚奠奡奢奥奭女奴奶奸她好妁如妃妄妆妇妈妊妍妒妓妖妗妘妙妞妣妤妥妧妨妩妪妫妭妮" + "妯妲妹妻妾姆姈姊始姐姑姒姓委姗姘姚姜姝姞姣姤姥姨姬姮姱姶姹姻姽姿娀威娃娄娅娆娇娈" + "娉娌娑娓娘娜娟娠娣娥娩娱娲娴娵娶娼婀婆婉婊婌婍婕婘婚婞婠婢婤婧婪婫婳婴婵婶婷婺婻" + "婼婿媂媄媆媒媓媖媚媛媞媪媭媱媲媳媵媸媾嫁嫂嫄嫉嫌嫒嫔嫕嫖嫘嫚嫜嫠嫡嫣嫦嫩嫪嫫嫭嫱" + "嫽嬉嬖嬗嬛嬥嬬嬴嬷嬿孀孅子孑孓孔孕孖字存孙孚孛孜孝孟孢季孤孥学孩孪孬孰孱孳孵孺孽" + "宁它宄宅宇守安宋完宏宓宕宗官宙定宛宜宝实宠审客宣室宥宦宧宪宫宬宰害宴宵家宸容宽宾" + "宿寁寂寄寅密寇富寐寒寓寝寞察寡寤寥寨寮寰寸对寺寻导寿封射将尉尊小少尔尕尖尘尚尜尝" + "尢尤尥尧尨尪尬就尴尸尹尺尻尼尽尾尿局屁层屃居屈屉届屋屎屏屐屑展屙属屠屡屣履屦屯山" + "屹屺屼屾屿岁岂岈岊岌岍岐岑岔岖岗岘岙岚岛岜岞岠岢岣岨岩岫岬岭岱岳岵岷岸岽岿峁峂峃" + "峄峋峒峗峘峙峛峡峣峤峥峦峧峨峪峭峰峱峻峿崀崁崂崃崄崆崇崌崎崒崔崖崚崛崞崟崡崤崦崧" + "崩崭崮崴崶崽崾崿嵁嵅嵇嵊嵋嵌嵎嵖嵘嵚嵛嵝嵩嵫嵬嵯嵲嵴嶂嶅嶍嶒嶓嶙嶝嶟嶦嶲嶷巅巇巉" + "巍川州巡巢工左巧巨巩巫差巯己已巳巴巷巽巾币市布帅帆师希帏帐帑帔帕帖帘帙帚帛帜帝帡" + "带帧帨席帮帱帷常帻帼帽幂幄幅幌幔幕幖幛幞幡幢幪干平年并幸幺幻幼幽广庄庆庇床庋序庐" + "庑库应底庖店庙庚府庞废庠庤庥度座庭庱庳庵庶康庸庹庼庾廆廉廊廋廑廒廓廖廙廛廨廪延廷" + "建廿开弁异弃弄弆弇弈弊弋式弑弓引弗弘弛弟张弢弥弦弧弨弩弭弯弱弶弸弹强弼彀归当录彖" + "彗彘彝彟形彤彦彧彩彪彬彭彰影彳彷役彻彼往征徂径待徇很徉徊律徐徒徕得徘徙徛徜御徨循" + "徭微徵德徼徽心必忆忉忌忍忏忐忑忒忖志忘忙忝忞忠忡忤忧忪快忭忮忱忳念忸忺忻忽忾忿怀" + "态怂怃怄怅怆怊怍怎怏怒怔怕怖怙怛怜思怠怡急怦性怨怩怪怫怯怵总怼怿恁恂恃恋恍恐恒恓" + "恔恕恙恚恝恢恣恤恧恨恩恪恫恬恭息恰恳恶恸恹恺恻恼恽恿悃悄悆悈悉悌悍悒悔悖悚悛悝悟" + "悠悢患悦您悫悬悭悯悰悱悲悴悸悻悼情惆惇惊惋惎惑惔惕惘惙惚惛惜惝惟惠惦惧惨惩惫惬惭" + "惮惯惰想惴惶惹惺愀愁愃愆愈愉愍愎意愐愔愕愚感愠愣愤愦愧愫愭愿慆慈慊慌慎慑慕慝慢慥" + "慧慨慬慭慰慵慷憋憎憔憕憙憧憨憩憬憭憷憺憾懂懈懊懋懑懒懔懦懵懿戆戈戊戋戌戍戎戏成我" + "戒戕或戗战戚戛戟戡戢戣戤戥截戬戭戮戳戴户戽戾房所扁扂扃扅扆扇扈扉扊手才扎扑扒打扔" + "托扛扞扣扦执扩扪扫扬扭扮扯扰扳扶批扺扼扽找承技抃抄抉把抑抒抓抔投抖抗折抚抛抟抠抡" + 
"抢护报抨披抬抱抵抹抻押抽抿拂拃拄担拆拇拈拉拊拌拍拎拐拒拓拔拖拗拘拙招拜拟拢拣拤拥" + "拦拧拨择括拭拮拯拱拳拴拶拷拼拽拾拿持挂指挈按挎挑挓挖挚挛挝挞挟挠挡挣挤挥挦挨挪挫" + "振挲挹挺挽捂捃捅捆捉捋捌捍捎捏捐捕捞损捡换捣捧捩捭据捯捶捷捺捻捽掀掂掇授掉掊掌掎" + "掏掐排掖掘掞掠探掣接控推掩措掬掭掮掰掳掴掷掸掺掼掾揄揆揉揍描提插揕揖揠握揣揩揪揭" + "揳援揶揸揽揿搀搁搂搅搋搌搏搐搒搓搔搛搜搞搠搡搦搪搬搭搴携搽摁摄摅摆摇摈摊摏摒摔摘" + "摛摞摧摩摭摴摸摹摽撂撄撅撇撑撒撕撖撙撞撤撩撬播撮撰撵撷撸撺撼擀擂擅操擎擐擒擘擞擢" + "擤擦擿攀攉攒攘攥攫攮支收攸改攻攽放政故效敉敌敏救敔敕敖教敛敝敞敢散敦敩敫敬数敲整" + "敷文斋斌斐斑斓斗料斛斜斝斟斠斡斤斥斧斩斫断斯新斶方於施旁旃旄旅旆旋旌旎族旐旒旖旗" + "旞无既日旦旧旨早旬旭旮旯旰旱旴旵时旷旸旺旻旿昀昂昃昄昆昇昈昉昊昌明昏昒易昔昕昙昝" + "星映昡昣昤春昧昨昪昫昭是昱昳昴昵昶昺昼昽显晁晃晅晊晋晌晏晐晒晓晔晕晖晗晙晚晞晟晡" + "晢晤晦晨晪晫普景晰晱晴晶晷智晾暂暄暅暇暌暑暕暖暗暝暧暨暮暲暴暵暶暹暾暿曈曌曙曛曜" + "曝曦曩曰曲曳更曷曹曼曾替最月有朋服朏朐朓朔朕朗望朝期朦木未末本札术朱朳朴朵朸机朽" + "杀杂权杄杆杈杉杌李杏材村杓杕杖杙杜杞束杠条来杧杨杩杪杭杯杰杲杳杵杷杻杼松板极构枅" + "枇枉枋枍析枕林枘枚果枝枞枢枣枥枧枨枪枫枭枯枰枲枳枵架枷枸枹柁柃柄柈柊柏某柑柒染柔" + "柖柘柙柚柜柝柞柠柢查柩柬柯柰柱柳柴柷柽柿栀栅标栈栉栊栋栌栎栏栐树栒栓栖栗栝栟校栩" + "株栲栳栴样核根栻格栽栾桀桁桂桃桄桅框案桉桊桌桎桐桑桓桔桕桠桡桢档桤桥桦桧桨桩桫桯" + "桲桴桶桷桹梁梃梅梆梌梏梓梗梠梢梣梦梧梨梭梯械梳梴梵梼梽梾梿检棁棂棉棋棍棐棒棓棕棘" + "棚棠棣棤棨棪棫棬森棰棱棵棹棺棻棼棽椀椁椅椆椋植椎椐椑椒椓椟椠椤椪椭椰椴椸椹椽椿楂" + "楒楔楗楙楚楝楞楠楣楦楩楪楫楮楯楷楸楹楼概榃榄榅榆榇榈榉榍榑榔榕榖榛榜榧榨榫榭榰榱" + "榴榷榻槁槃槊槌槎槐槔槚槛槜槟槠槭槱槲槽槿樊樗樘樟模樨横樯樱樵樽樾橄橇橐橑橘橙橛橞" + "橡橥橦橱橹橼檀檄檎檐檑檗檞檠檩檫檬櫆欂欠次欢欣欤欧欲欸欹欺欻款歃歅歆歇歉歌歙止正" + "此步武歧歪歹死歼殁殂殃殄殆殇殉殊残殍殒殓殖殚殛殡殣殪殳殴段殷殿毁毂毅毋毌母每毐毒" + "毓比毕毖毗毙毛毡毪毫毯毳毵毹毽氅氆氇氍氏氐民氓气氕氖氘氙氚氛氟氡氢氤氦氧氨氩氪氮" + "氯氰氲水永氾氿汀汁求汆汇汈汉汊汋汐汔汕汗汛汜汝汞江池污汤汧汨汩汪汫汭汰汲汴汶汹汽" + "汾沁沂沃沄沅沆沇沈沉沌沏沐沓沔沘沙沚沛沟没沣沤沥沦沧沨沩沪沫沭沮沱河沸油沺治沼沽" + "沾沿泂泃泄泅泇泉泊泌泐泓泔法泖泗泙泚泛泜泞泠泡波泣泥注泪泫泮泯泰泱泳泵泷泸泺泻泼" + "泽泾洁洄洇洈洋洌洎洑洒洓洗洘洙洚洛洞洢洣津洧洨洪洫洭洮洱洲洳洴洵洸洹洺活洼洽派洿" + "流浃浅浆浇浈浉浊测浍济浏浐浑浒浓浔浕浙浚浛浜浞浟浠浡浣浥浦浩浪浬浭浮浯浰浲浴海浸" + "浼涂涄涅消涉涌涍涎涐涑涓涔涕涘涛涝涞涟涠涡涢涣涤润涧涨涩涪涫涮涯液涴涵涸涿淀淄淅" + "淆淇淋淌淏淑淖淘淙淜淝淞淟淠淡淤淦淫淬淮淯深淳淴混淹添淼清渊渌渍渎渐渑渔渗渚渝渟" + "渠渡渣渤渥温渫渭港渰渲渴游渺渼湃湄湉湍湎湑湓湔湖湘湛湜湝湟湣湫湮湲湴湾湿溁溃溅溆" + "溇溉溍溏源溘溚溜溞溟溠溢溥溦溧溪溯溱溲溴溵溶溷溹溺溻溽滁滂滃滆滇滉滋滍滏滑滓滔滕" + "滗滘滚滞滟滠满滢滤滥滦滧滨滩滪滫滴滹漂漆漈漉漋漏漓演漕漖漠漤漦漩漪漫漭漯漱漳漴漶" + "漷漹漻漼漾潆潇潋潍潏潖潘潜潞潟潢潦潩潭潮潲潴潵潸潺潼潽潾澂澄澈澉澌澍澎澛澜澡澥澧" + "澪澭澳澴澶澹澼澽激濂濉濋濑濒濞濠濡濩濮濯瀌瀍瀑瀔瀚瀛瀣瀱瀵瀹瀼灈灌灏灞火灭灯灰灵" + "灶灸灼灾灿炀炅炆炉炊炌炎炒炔炕炖炘炙炜炝炟炣炫炬炭炮炯炱炳炷炸点炻炼炽烀烁烂烃烈" + "烊烔烘烙烛烜烝烟烠烤烦烧烨烩烫烬热烯烶烷烹烺烻烽焆焉焊焌焐焓焕焖焗焘焙焚焜焞焦焯" + "焰焱然煁煃煅煊煋煌煎煓煜煞煟煤煦照煨煮煲煳煴煸煺煽熄熇熊熏熔熘熙熛熜熟熠熥熨熬熵" + "熹熻燃燊燋燎燏燔燕燚燠燥燧燮燹爆爇爔爚爝爟爨爪爬爰爱爵父爷爸爹爻爽爿牁牂片版牌牍" + "牒牖牙牚牛牝牟牡牢牤牥牦牧物牮牯牲牵特牺牻牾牿犀犁犄犇犊犋犍犏犒犟犨犬犯犰犴状犷" + "犸犹狁狂狃狄狈狉狍狎狐狒狗狙狝狞狠狡狨狩独狭狮狯狰狱狲狳狴狷狸狺狻狼猁猃猄猇猊猎" + "猕猖猗猛猜猝猞猡猢猥猩猪猫猬献猯猰猱猴猷猹猺猾猿獍獐獒獗獠獬獭獯獴獾玃玄率玉王玎" + "玑玒玓玕玖玘玙玚玛玞玟玠玡玢玤玥玦玩玫玭玮环现玱玲玳玶玷玹玺玻玼玿珀珂珅珇珈珉珊" + "珋珌珍珏珐珑珒珕珖珙珛珝珞珠珢珣珥珦珧珩珪珫班珰珲珵珷珸珹珺珽琀球琄琅理琇琈琉琊" + 
"琎琏琐琔琚琛琟琡琢琤琥琦琨琪琫琬琭琮琯琰琲琳琴琵琶琼瑀瑁瑂瑃瑄瑅瑆瑑瑓瑔瑕瑖瑗瑙" + "瑚瑛瑜瑝瑞瑟瑢瑧瑨瑬瑭瑰瑱瑳瑶瑷瑾璀璁璃璆璇璈璋璎璐璒璘璜璞璟璠璥璧璨璩璪璬璮璱" + "璲璺瓀瓒瓖瓘瓜瓞瓠瓢瓣瓤瓦瓮瓯瓴瓶瓷瓻瓿甄甍甏甑甓甗甘甚甜生甡甥甦用甩甪甫甬甭甯" + "田由甲申电男甸町画甾畀畅畈畋界畎畏畔畖留畚畛畜畤略畦番畬畯畲畴畸畹畿疁疃疆疍疏疐" + "疑疔疖疗疙疚疝疟疠疡疢疣疤疥疫疬疭疮疯疰疱疲疳疴疵疸疹疼疽疾痂痃痄病症痈痉痊痍痒" + "痓痔痕痘痛痞痢痣痤痦痧痨痪痫痰痱痴痹痼痿瘀瘁瘃瘅瘆瘊瘌瘐瘕瘗瘘瘙瘛瘟瘠瘢瘤瘥瘦瘩" + "瘪瘫瘭瘰瘳瘴瘵瘸瘼瘾瘿癀癃癌癍癔癖癗癜癞癣癫癯癸登白百癿皂的皆皇皈皋皎皑皓皕皖皙" + "皛皞皤皦皭皮皱皲皴皿盂盅盆盈盉益盍盎盏盐监盒盔盖盗盘盛盟盥盦目盯盱盲直盷相盹盼盾" + "省眄眇眈眉眊看眍眙眚真眠眢眦眨眩眬眭眯眵眶眷眸眺眼着睁睃睄睇睎睐睑睚睛睡睢督睥睦" + "睨睫睬睹睽睾睿瞀瞄瞅瞋瞌瞍瞎瞑瞒瞟瞠瞢瞥瞧瞩瞪瞫瞬瞭瞰瞳瞵瞻瞽瞿矍矗矛矜矞矢矣知" + "矧矩矫矬短矮矰石矶矸矻矼矾矿砀码砂砄砆砉砌砍砑砒研砖砗砘砚砜砝砟砠砣砥砧砫砬砭砮" + "砰破砵砷砸砹砺砻砼砾础硁硅硇硊硌硍硎硐硒硔硕硖硗硙硚硝硪硫硬硭确硼硿碃碇碈碉碌碍" + "碎碏碑碓碗碘碚碛碜碟碡碣碥碧碨碰碱碲碳碴碶碹碾磁磅磉磊磋磏磐磔磕磙磜磡磨磬磲磴磷" + "磹磻礁礅礌礓礞礴礵示礼社祀祁祃祆祇祈祉祊祋祎祏祐祓祕祖祗祚祛祜祝神祟祠祢祥祧票祭" + "祯祲祷祸祺祼祾禀禁禄禅禊禋福禒禔禘禚禛禤禧禳禹禺离禽禾秀私秃秆秉秋种科秒秕秘租秣" + "秤秦秧秩秫秬秭积称秸移秽秾稀稂稃稆程稌稍税稑稔稗稙稚稞稠稣稳稷稹稻稼稽稿穄穆穑穗" + "穙穜穟穰穴究穷穸穹空穿窀突窃窄窅窈窊窍窎窑窒窕窖窗窘窜窝窟窠窣窥窦窨窬窭窳窸窿立" + "竑竖竘站竞竟章竣童竦竫竭端竹竺竽竿笃笄笆笈笊笋笏笑笔笕笙笛笞笠笤笥符笨笪笫第笮笯" + "笱笳笸笺笼笾筀筅筇等筋筌筏筐筑筒答策筘筚筛筜筝筠筢筤筥筦筮筱筲筵筶筷筹筻筼签简箅" + "箍箐箓箔箕箖算箜管箢箦箧箨箩箪箫箬箭箱箴箸篁篆篇篌篑篓篙篚篝篡篥篦篪篮篯篱篷篼篾" + "簃簇簉簋簌簏簕簖簝簟簠簧簪簰簸簿籀籁籍籥米籴类籼籽粉粑粒粕粗粘粜粝粞粟粢粤粥粪粮" + "粱粲粳粹粼粽精粿糁糅糇糈糊糌糍糒糕糖糗糙糜糟糠糨糯糵系紊素索紧紫累絜絮絷綦綮縠縢" + "縻繁繄繇纂纛纠纡红纣纤纥约级纨纩纪纫纬纭纮纯纰纱纲纳纴纵纶纷纸纹纺纻纼纽纾线绀绁" + "绂练组绅细织终绉绊绋绌绍绎经绐绑绒结绔绕绖绗绘给绚绛络绝绞统绠绡绢绣绤绥绦继绨绩" + "绪绫续绮绯绰绱绲绳维绵绶绷绸绹绺绻综绽绾绿缀缁缂缃缄缅缆缇缈缉缊缌缎缐缑缒缓缔缕" + "编缗缘缙缚缛缜缝缞缟缠缡缢缣缤缥缦缧缨缩缪缫缬缭缮缯缰缱缲缳缴缵缶缸缺罂罄罅罍罐" + "网罔罕罗罘罚罟罡罢罨罩罪置罱署罴罶罹罽罾羁羊羌美羑羓羔羕羖羚羝羞羟羡群羧羯羰羱羲" + "羸羹羼羽羿翀翁翂翃翅翈翊翌翎翔翕翘翙翚翛翟翠翡翥翦翩翮翯翰翱翳翷翻翼翾耀老考耄者" + "耆耇耋而耍耏耐耑耒耔耕耖耗耘耙耜耠耢耤耥耦耧耨耩耪耰耱耳耵耶耷耸耻耽耿聂聃聆聊聋" + "职聍聒联聘聚聩聪聱聿肃肄肆肇肉肋肌肓肖肘肚肛肝肟肠股肢肤肥肩肪肫肭肮肯肱育肴肷肸" + "肺肼肽肾肿胀胁胂胃胄胆胈背胍胎胖胗胙胚胛胜胝胞胠胡胣胤胥胧胨胩胪胫胬胭胯胰胱胲胳" + "胴胶胸胺胼能脂脆脉脊脍脎脏脐脑脒脓脔脖脘脚脞脟脩脬脯脱脲脶脸脾脿腆腈腊腋腌腐腑腒" + "腓腔腕腘腙腚腠腥腧腨腩腭腮腯腰腱腴腹腺腻腼腽腾腿膀膂膈膊膏膑膘膙膛膜膝膦膨膳膺膻" + "臀臂臃臆臊臌臑臜臣臧自臬臭至致臻臼臾舀舁舂舄舅舆舌舍舐舒舔舛舜舞舟舠舢舣舥航舫般" + "舭舯舰舱舲舳舴舵舶舷舸船舻舾艄艅艇艉艋艎艏艘艚艟艨艮良艰色艳艴艺艽艾艿节芃芄芈芊" + "芋芍芎芏芑芒芗芘芙芜芝芟芠芡芣芤芥芦芨芩芪芫芬芭芮芯芰花芳芴芷芸芹芼芽芾苁苄苇苈" + "苉苊苋苌苍苎苏苑苒苓苔苕苗苘苛苜苞苟苠苡苣苤若苦苧苫苯英苴苷苹苻苾茀茁茂范茄茅茆" + "茈茉茋茌茎茏茑茓茔茕茗茚茛茜茝茧茨茫茬茭茯茱茳茴茵茶茸茹茺茼茽荀荁荃荄荆荇草荏荐" + "荑荒荓荔荖荙荚荛荜荞荟荠荡荣荤荥荦荧荨荩荪荫荬荭荮药荷荸荻荼荽莅莆莉莎莒莓莘莙莛" + "莜莝莞莠莨莩莪莫莰莱莲莳莴莶获莸莹莺莼莽莿菀菁菂菅菇菉菊菌菍菏菔菖菘菜菝菟菠菡菥" + "菩菪菰菱菲菹菼菽萁萃萄萆萋萌萍萎萏萑萘萚萜萝萣萤营萦萧萨萩萱萳萸萹萼落葆葎葑葖著" + "葙葚葛葜葡董葩葫葬葭葰葱葳葴葵葶葸葺蒂蒄蒇蒈蒉蒋蒌蒎蒐蒗蒙蒜蒟蒡蒨蒯蒱蒲蒴蒸蒹蒺" + "蒻蒽蒿蓁蓂蓄蓇蓉蓊蓍蓏蓐蓑蓓蓖蓝蓟蓠蓢蓣蓥蓦蓬蓰蓼蓿蔀蔃蔈蔊蔌蔑蔓蔗蔚蔟蔡蔫蔬蔷" + "蔸蔹蔺蔻蔼蔽蕃蕈蕉蕊蕖蕗蕙蕞蕤蕨蕰蕲蕴蕹蕺蕻蕾薁薄薅薇薏薛薜薢薤薨薪薮薯薰薳薷薸" + 
"薹薿藁藉藏藐藓藕藜藟藠藤藦藨藩藻藿蘅蘑蘖蘘蘧蘩蘸蘼虎虏虐虑虒虓虔虚虞虢虤虫虬虮虱" + "虷虸虹虺虻虼虽虾虿蚀蚁蚂蚄蚆蚊蚋蚌蚍蚓蚕蚜蚝蚣蚤蚧蚨蚩蚪蚬蚯蚰蚱蚲蚴蚶蚺蛀蛃蛄蛆" + "蛇蛉蛊蛋蛎蛏蛐蛑蛔蛘蛙蛛蛞蛟蛤蛩蛭蛮蛰蛱蛲蛳蛴蛸蛹蛾蜀蜂蜃蜇蜈蜉蜊蜍蜎蜐蜒蜓蜕蜗" + "蜘蜚蜜蜞蜡蜢蜣蜥蜩蜮蜱蜴蜷蜻蜾蜿蝇蝈蝉蝌蝎蝓蝗蝘蝙蝠蝣蝤蝥蝮蝰蝲蝴蝶蝻蝼蝽蝾螂螃" + "螅螈螋融螗螟螠螣螨螫螬螭螯螱螳螵螺螽蟀蟆蟊蟋蟏蟑蟒蟛蟠蟥蟪蟫蟮蟹蟾蠃蠊蠋蠓蠕蠖蠡" + "蠢蠲蠹蠼血衃衄衅行衍衎衒衔街衙衠衡衢衣补表衩衫衬衮衰衲衷衽衾衿袁袂袄袅袆袈袋袍袒" + "袖袗袜袢袤袪被袭袯袱袷袼裁裂装裆裈裉裎裒裔裕裘裙裛裟裢裣裤裥裨裰裱裳裴裸裹裼裾褂" + "褊褐褒褓褕褙褚褛褟褡褥褪褫褯褰褴褶襁襄襕襚襜襞襟襦襫襻西要覃覆见观觃规觅视觇览觉" + "觊觋觌觎觏觐觑角觖觚觜觞觟解觥触觫觭觯觱觳觿言訄訇訚訾詈詟詹誉誊誓謇警譬计订讣认" + "讥讦讧讨让讪讫训议讯记讱讲讳讴讵讶讷许讹论讻讼讽设访诀证诂诃评诅识诇诈诉诊诋诌词" + "诎诏诐译诒诓诔试诖诗诘诙诚诛诜话诞诟诠诡询诣诤该详诧诨诩诫诬语诮误诰诱诲诳说诵请" + "诸诹诺读诼诽课诿谀谁谂调谄谅谆谇谈谊谋谌谍谎谏谐谑谒谓谔谕谖谗谙谚谛谜谝谞谟谠谡" + "谢谣谤谥谦谧谨谩谪谫谬谭谮谯谰谱谲谳谴谵谶谷谼谿豁豆豇豉豌豕豚象豢豨豪豫豮豳豸豹" + "豺貂貅貆貉貊貌貔貘贝贞负贡财责贤败账货质贩贪贫贬购贮贯贰贱贲贳贴贵贶贷贸费贺贻贼" + "贽贾贿赀赁赂赃资赅赆赇赈赉赊赋赌赍赎赏赐赑赒赓赔赕赖赗赘赙赚赛赜赝赞赟赠赡赢赣赤" + "赦赧赪赫赭走赳赴赵赶起趁趄超越趋趑趔趟趣趯趱足趴趵趸趺趼趾趿跂跃跄跆跋跌跎跏跐跑" + "跖跗跚跛距跞跟跣跤跨跪跬路跱跳践跶跷跸跹跺跻跽踅踉踊踌踏踒踔踝踞踟踢踣踦踩踪踬踮" + "踯踱踵踶踹踺踽蹀蹁蹂蹄蹅蹇蹈蹉蹊蹋蹐蹑蹒蹙蹚蹜蹢蹦蹩蹬蹭蹯蹰蹲蹴蹶蹼蹽蹾蹿躁躅躇" + "躏躐躔躜躞身躬躯躲躺车轧轨轩轪轫转轭轮软轰轱轲轳轴轵轶轷轸轹轺轻轼载轾轿辀辁辂较" + "辄辅辆辇辈辉辊辋辌辍辎辏辐辑辒输辔辕辖辗辘辙辚辛辜辞辟辣辨辩辫辰辱边辽达辿迁迂迄" + "迅过迈迎运近迓返迕还这进远违连迟迢迤迥迦迨迩迪迫迭迮述迳迷迸迹迺追退送适逃逄逅逆" + "选逊逋逍透逐逑递途逖逗通逛逝逞速造逡逢逦逭逮逯逴逵逶逸逻逼逾遁遂遄遆遇遍遏遐遑遒" + "道遗遘遛遢遣遥遨遭遮遴遵遹遽避邀邂邃邈邋邑邓邕邗邘邙邛邝邠邡邢那邦邨邪邬邮邯邰邱" + "邲邳邴邵邶邸邹邺邻邽邾邿郁郃郄郅郇郈郊郎郏郐郑郓郗郚郛郜郝郡郢郤郦郧部郪郫郭郯郴" + "郸都郾郿鄀鄂鄃鄄鄅鄌鄑鄗鄘鄙鄚鄜鄞鄠鄢鄣鄫鄯鄱鄹酂酃酅酆酉酊酋酌配酎酏酐酒酗酚酝" + "酞酡酢酣酤酥酦酩酪酬酮酯酰酱酲酴酵酶酷酸酹酺酽酾酿醅醇醉醋醌醍醐醑醒醚醛醢醨醪醭" + "醮醯醴醵醺醾采釉释里重野量釐金釜鉴銎銮鋆鋈錾鍪鎏鏊鏖鐾鑫钆钇针钉钊钋钌钍钎钏钐钒" + "钓钔钕钖钗钘钙钚钛钜钝钞钟钠钡钢钣钤钥钦钧钨钩钪钫钬钭钮钯钰钱钲钳钴钵钷钹钺钻钼" + "钽钾钿铀铁铂铃铄铅铆铈铉铊铋铌铍铎铏铐铑铒铕铖铗铘铙铚铛铜铝铞铟铠铡铢铣铤铥铧铨" + "铩铪铫铬铭铮铯铰铱铲铳铴铵银铷铸铹铺铻铼铽链铿销锁锂锃锄锅锆锇锈锉锊锋锌锍锎锏锐" + "锑锒锓锔锕锖锗锘错锚锛锜锝锞锟锡锢锣锤锥锦锧锨锩锪锫锬锭键锯锰锱锲锳锴锵锶锷锸锹" + "锺锻锼锽锾锿镀镁镂镃镄镅镆镇镈镉镊镋镌镍镎镏镐镑镒镓镔镕镖镗镘镚镛镜镝镞镠镡镢镣" + "镤镥镦镧镨镩镪镫镬镭镮镯镰镱镲镳镴镵镶长门闩闪闫闭问闯闰闱闲闳间闵闶闷闸闹闺闻闼" + "闽闾闿阀阁阂阃阄阅阆阇阈阉阊阋阌阍阎阏阐阑阒阔阕阖阗阘阙阚阜队阡阪阮阱防阳阴阵阶" + "阻阼阽阿陀陂附际陆陇陈陉陋陌降陎限陑陔陕陛陞陟陡院除陧陨险陪陬陲陴陵陶陷隃隅隆隈" + "隋隍随隐隔隗隘隙障隧隩隰隳隶隹隺隼隽难雀雁雄雅集雇雉雊雌雍雎雏雒雕雠雨雩雪雯雱雳" + "零雷雹雾需霁霄霅霆震霈霉霍霎霏霓霖霜霞霨霪霭霰露霸霹霾青靓靖静靛非靠靡面靥革靬靰" + "靳靴靶靸靺靼靽靿鞁鞅鞋鞍鞑鞒鞔鞘鞠鞡鞣鞧鞨鞫鞬鞭鞮鞯鞲鞳鞴韂韦韧韨韩韪韫韬韭音韵" + "韶页顶顷顸项顺须顼顽顾顿颀颁颂颃预颅领颇颈颉颊颋颌颍颎颏颐频颓颔颖颗题颙颚颛颜额" + "颞颟颠颡颢颤颥颦颧风飏飐飑飒飓飔飕飗飘飙飞食飧飨餍餐餮饔饕饥饧饨饩饪饫饬饭饮饯饰" + "饱饲饳饴饵饶饷饸饹饺饻饼饽饿馁馃馄馅馆馇馈馉馊馋馌馍馏馐馑馒馓馔馕首馗馘香馝馞馥" + "馧馨马驭驮驯驰驱驲驳驴驵驶驷驸驹驺驻驼驽驾驿骀骁骂骃骄骅骆骇骈骉骊骋验骍骎骏骐骑" + "骒骓骕骖骗骘骙骚骛骜骝骞骟骠骡骢骣骤骥骦骧骨骰骱骶骷骸骺骼髀髁髂髃髅髋髌髎髑髓高" + "髡髢髦髫髭髯髹髻髽鬃鬈鬏鬒鬓鬘鬟鬣鬯鬲鬶鬷鬻鬼魁魂魃魄魅魆魇魈魉魋魍魏魑魔鱼鱽鱾" + 
class ChineseChar(object):
    """A Chinese character that has a simplified and a traditional form.

    e.g. simplified = '负', traditional = '負'. Either form can be picked
    when a sequence of symbols is rendered back into text.

    Args:
        simplified: the simplified form (may be None or empty).
        traditional: the traditional form (may be None or empty).
    """

    def __init__(self, simplified, traditional):
        self.simplified = simplified
        self.traditional = traditional

    def __str__(self):
        # __str__ must return a str: falling back to None made str()/repr()
        # raise TypeError for symbols built without any written form.
        return self.simplified or self.traditional or ""

    def __repr__(self):
        return self.__str__()
class ChineseNumberDigit(ChineseChar):
    """A Chinese digit character.

    Besides the simplified/traditional forms inherited from ChineseChar, a
    digit carries its numeric value, the two "big" forms, and optional
    alternative forms.
    """

    def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None):
        super().__init__(simplified, traditional)
        self.value = value
        self.big_s, self.big_t = big_s, big_t
        self.alt_s, self.alt_t = alt_s, alt_t

    def __str__(self):
        # A digit prints as its numeric value, not as a Chinese character.
        return str(self.value)

    @classmethod
    def create(cls, i, v):
        """Build the digit with value ``i`` from ``v``.

        ``v`` is indexed as (simplified, traditional, big_s, big_t).
        """
        simplified, traditional, big_s, big_t = v[0], v[1], v[2], v[3]
        return ChineseNumberDigit(i, simplified, traditional, big_s, big_t)
class MathSymbol(object):
    """Math symbols (simplified/traditional) used by the Chinese number system, e.g.

    positive = ['正', '正']
    negative = ['负', '負']
    point = ['点', '點']

    Iterating over an instance yields its attribute values in the order in
    which they were assigned.
    """

    def __init__(self, positive, negative, point):
        self.positive = positive
        self.negative = negative
        self.point = point

    def __iter__(self):
        yield from vars(self).values()
+ 返回对应的数字系统 + """ + + # chinese number units of '亿' and larger + all_larger_units = zip(LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED, LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL) + larger_units = [CNU.create(i, v, numbering_type, False) for i, v in enumerate(all_larger_units)] + # chinese number units of '十, 百, 千, 万' + all_smaller_units = zip(SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED, SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL) + smaller_units = [CNU.create(i, v, small_unit=True) for i, v in enumerate(all_smaller_units)] + # digis + chinese_digis = zip(CHINESE_DIGIS, CHINESE_DIGIS, BIG_CHINESE_DIGIS_SIMPLIFIED, BIG_CHINESE_DIGIS_TRADITIONAL) + digits = [CND.create(i, v) for i, v in enumerate(chinese_digis)] + digits[0].alt_s, digits[0].alt_t = ZERO_ALT, ZERO_ALT + digits[1].alt_s, digits[1].alt_t = ONE_ALT, ONE_ALT + digits[2].alt_s, digits[2].alt_t = TWO_ALTS[0], TWO_ALTS[1] + + # symbols + positive_cn = CM(POSITIVE[0], POSITIVE[1], "+", lambda x: x) + negative_cn = CM(NEGATIVE[0], NEGATIVE[1], "-", lambda x: -x) + point_cn = CM(POINT[0], POINT[1], ".", lambda x, y: float(str(x) + "." 
def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]):
    """Convert a Chinese numeral string into a string of Arabic digits.

    Args:
        chinese_string: the Chinese numeral text; it may contain one decimal
            point character (simplified or traditional).
        numbering_type: forwarded to ``create_system`` to select the
            numbering system used for the large units.

    Returns:
        The integer part as a decimal string, followed by "." and the decimal
        digits when a decimal part is present.

    Raises:
        ValueError: if the decimal point character occurs more than once
            (the two-way unpacking of ``split`` fails).
    """

    def get_symbol(char, system):
        """Return the unit, digit or math symbol of ``system`` written as ``char``.

        Units are tried first, then digits, then math symbols. Falls off the
        end (returns None) when nothing matches.
        """
        for u in system.units:
            if char in [u.traditional, u.simplified, u.big_s, u.big_t]:
                return u
        for d in system.digits:
            if char in [d.traditional, d.simplified, d.big_s, d.big_t, d.alt_s, d.alt_t]:
                return d
        for m in system.math:
            if char in [m.traditional, m.simplified]:
                return m

    def string2symbols(chinese_string, system):
        """Split the text at the decimal point and map every character to a symbol.

        Returns a pair (integer symbols, decimal symbols); the decimal list is
        empty when the text has no decimal point.
        """
        int_string, dec_string = chinese_string, ""
        for p in [system.math.point.simplified, system.math.point.traditional]:
            if p in chinese_string:
                int_string, dec_string = chinese_string.split(p)
                break
        return [get_symbol(c, system) for c in int_string], [get_symbol(c, system) for c in dec_string]

    def correct_symbols(integer_symbols, system):
        """Normalise abbreviated spellings before the value is computed.

        Examples from the original notes:
            一百八 -> 一百八十
            一亿一千三百万 -> 一亿 一千万 三百万
        """

        # A leading unit of power 1 gets an explicit digit one in front of it.
        if integer_symbols and isinstance(integer_symbols[0], CNU):
            if integer_symbols[0].power == 1:
                integer_symbols = [system.digits[1]] + integer_symbols

        # A trailing digit right after a unit implicitly carries the next
        # smaller unit (power - 1), so that unit is appended explicitly.
        if len(integer_symbols) > 1:
            if isinstance(integer_symbols[-1], CND) and isinstance(integer_symbols[-2], CNU):
                integer_symbols.append(CNU(integer_symbols[-2].power - 1, None, None, None, None))

        result = []
        unit_count = 0  # number of consecutive units seen since the last digit
        for s in integer_symbols:
            if isinstance(s, CND):
                result.append(s)
                unit_count = 0
            elif isinstance(s, CNU):
                # Work on a copy that only carries the power.
                current_unit = CNU(s.power, None, None, None, None)
                unit_count += 1

            if unit_count == 1:
                result.append(current_unit)
            elif unit_count > 1:
                # A unit directly following another unit is folded into every
                # smaller unit already collected, by adding the powers.
                for i in range(len(result)):
                    if isinstance(result[-i - 1], CNU) and result[-i - 1].power < current_unit.power:
                        result[-i - 1] = CNU(result[-i - 1].power + current_unit.power, None, None, None, None)
        return result

    def compute_value(integer_symbols):
        """
        Compute the value.
        When current unit is larger than previous unit, current unit * all previous units will be used as all previous units.
        e.g. '两千万' = 2000 * 10000 not 2000 + 10000
        """
        value = [0]  # one partial sum per "digit + unit" group; the last entry is still open
        last_power = 0
        for s in integer_symbols:
            if isinstance(s, CND):
                value[-1] = s.value
            elif isinstance(s, CNU):
                value[-1] *= pow(10, s.power)
                if s.power > last_power:
                    value[:-1] = list(map(lambda v: v * pow(10, s.power), value[:-1]))
                    last_power = s.power
                value.append(0)
        return sum(value)

    system = create_system(numbering_type)
    int_part, dec_part = string2symbols(chinese_string, system)
    int_part = correct_symbols(int_part, system)
    int_str = str(compute_value(int_part))
    # Decimal digits are concatenated as-is, one character per digit.
    dec_str = "".join([str(d.value) for d in dec_part])
    if dec_part:
        return "{0}.{1}".format(int_str, dec_str)
    else:
        return int_str
= int_dec[0] + dec_string = int_dec[1] + else: + raise ValueError("invalid input num string with more than one dot: {}".format(number_string)) + + if use_units and len(int_string) > 1: + result_symbols = get_value(int_string) + else: + result_symbols = [system.digits[int(c)] for c in int_string] + dec_symbols = [system.digits[int(c)] for c in dec_string] + if dec_string: + result_symbols += [system.math.point] + dec_symbols + + if alt_two: + liang = CND(2, system.digits[2].alt_s, system.digits[2].alt_t, system.digits[2].big_s, system.digits[2].big_t) + for i, v in enumerate(result_symbols): + if isinstance(v, CND) and v.value == 2: + next_symbol = result_symbols[i + 1] if i < len(result_symbols) - 1 else None + previous_symbol = result_symbols[i - 1] if i > 0 else None + if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))): + if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)): + result_symbols[i] = liang + + # if big is True, '两' will not be used and `alt_two` has no impact on output + if big: + attr_name = "big_" + if traditional: + attr_name += "t" + else: + attr_name += "s" + else: + if traditional: + attr_name = "traditional" + else: + attr_name = "simplified" + + result = "".join([getattr(s, attr_name) for s in result_symbols]) + + # if not use_zeros: + # result = result.strip(getattr(system.digits[0], attr_name)) + + if alt_zero: + result = result.replace(getattr(system.digits[0], attr_name), system.digits[0].alt_s) + + if alt_one: + result = result.replace(getattr(system.digits[1], attr_name), system.digits[1].alt_s) + + for i, p in enumerate(POINT): + if result.startswith(p): + return CHINESE_DIGIS[0] + result + + # ^10, 11, .., 19 + if ( + len(result) >= 2 + and result[1] in [SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0], SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0]] + and result[0] in [CHINESE_DIGIS[1], BIG_CHINESE_DIGIS_SIMPLIFIED[1], BIG_CHINESE_DIGIS_TRADITIONAL[1]] + ): + result = 
class TelePhone:
    """Rewriter for telephone numbers (digits are read out one by one)."""

    def __init__(self, telephone=None, raw_chntext=None, chntext=None):
        self.telephone = telephone
        self.raw_chntext = raw_chntext
        self.chntext = chntext

    def telephone2chntext(self, fixed=False):
        """Render ``self.telephone`` as Chinese text and return it.

        With ``fixed=True`` the number is split on "-"; otherwise the
        surrounding "+" characters are stripped and the number is split on
        whitespace. Both ``raw_chntext`` and ``chntext`` are updated.
        """
        if fixed:
            parts = self.telephone.split("-")
        else:
            parts = self.telephone.strip("+").split()

        spoken = [num2chn(part, alt_two=False, use_units=False) for part in parts]
        raw = "".join(spoken)
        self.raw_chntext = raw
        # NOTE(review): the separator literal is empty, so this replace leaves
        # the text unchanged; presumably a separator token once stood here --
        # confirm against the upstream source.
        self.chntext = raw.replace("", "")
        return self.chntext
class Percentage:
    """Rewriter for percentages, e.g. "35%" <-> "百分之三十五".

    Args:
        percentage: the numeric form, a number followed by "%".
        chntext: the Chinese form, "百分之" followed by a Chinese numeral.
    """

    def __init__(self, percentage=None, chntext=None):
        self.percentage = percentage
        self.chntext = chntext

    def chntext2percentage(self):
        """Return the numeric form ("N%") of ``self.chntext``."""
        prefix = "百分之"
        text = self.chntext.strip()
        # str.strip(prefix) treats its argument as a *set of characters* and
        # also eats them from the right end, so "百分之一百" used to collapse
        # to "一" (1% instead of 100%). Only the leading prefix is removed.
        if text.startswith(prefix):
            text = text[len(prefix):]
        return chn2num(text) + "%"

    def percentage2chntext(self):
        """Return the Chinese form of ``self.percentage``."""
        return "百分之" + num2chn(self.percentage.strip().strip("%"))
def remove_space(text):
    """Drop whitespace from ``text``, keeping one space between English tokens.

    The text is split on whitespace. A single space is re-inserted between
    two neighbouring tokens only when the last character of the first and
    the first character of the second are both found in ``IN_EN_CHARS``.
    """
    tokens = text.split()
    if not tokens:
        return ""

    pieces = [tokens[0]]
    for previous, current in zip(tokens, tokens[1:]):
        if IN_EN_CHARS.get(previous[-1]) and IN_EN_CHARS.get(current[0]):
            pieces.append(" ")
        pieces.append(current)
    return "".join(pieces)
if __name__ == "__main__":
    # Command-line entry point: normalise every line/record of `ifile` and
    # write the non-empty results to `ofile`.
    p = argparse.ArgumentParser()

    # normalizer options
    p.add_argument("--to_banjiao", action="store_true", help="convert quanjiao chars to banjiao")
    p.add_argument("--to_upper", action="store_true", help="convert to upper case")
    p.add_argument("--to_lower", action="store_true", help="convert to lower case")
    p.add_argument("--remove_fillers", action="store_true", help='remove filler chars such as "呃, 啊"')
    p.add_argument("--remove_erhua", action="store_true", help='remove erhua chars such as "他女儿在那边儿 -> 他女儿在那边"')
    p.add_argument("--check_chars", action="store_true", help="skip sentences containing illegal chars")
    p.add_argument("--remove_space", action="store_true", help="remove whitespace")
    p.add_argument(
        "--cc_mode", choices=["", "t2s", "s2t"], default="", help="convert between traditional to simplified"
    )

    # I/O options
    p.add_argument("--log_interval", type=int, default=10000, help="log interval in number of processed lines")
    p.add_argument("--has_key", action="store_true", help="will be deprecated, set --format ark instead")
    p.add_argument("--format", type=str, choices=["txt", "ark", "tsv"], default="txt", help="input format")
    p.add_argument("ifile", help="input filename, assume utf-8 encoding")
    p.add_argument("ofile", help="output filename")

    args = p.parse_args()

    if args.has_key:
        args.format = "ark"

    # Built once: the previous duplicate construction only repeated the work
    # (including loading the OpenCC converter when --cc_mode is set).
    normalizer = TextNorm(
        to_banjiao=args.to_banjiao,
        to_upper=args.to_upper,
        to_lower=args.to_lower,
        remove_fillers=args.remove_fillers,
        remove_erhua=args.remove_erhua,
        check_chars=args.check_chars,
        remove_space=args.remove_space,
        cc_mode=args.cc_mode,
    )

    ndone = 0
    with open(args.ifile, "r", encoding="utf8") as istream, open(args.ofile, "w+", encoding="utf8") as ostream:
        if args.format == "tsv":
            reader = csv.DictReader(istream, delimiter="\t")
            fieldnames = reader.fieldnames
            # Explicit check instead of `assert`: asserts vanish under -O and
            # an empty input file leaves `fieldnames` as None.
            if not fieldnames or "TEXT" not in fieldnames:
                raise ValueError("tsv input must have a header line containing a TEXT column")
            print("\t".join(fieldnames), file=ostream)

            for item in reader:
                text = item["TEXT"]

                if text:
                    text = normalizer(text)

                if text:
                    item["TEXT"] = text
                    # Short rows yield None for the missing columns; write
                    # them as empty fields rather than crashing in join().
                    print("\t".join([item[f] or "" for f in fieldnames]), file=ostream)

                ndone += 1
                if ndone % args.log_interval == 0:
                    print(f"text norm: {ndone} lines done.", file=sys.stderr, flush=True)
        else:
            for line in istream:
                key, text = "", ""
                if args.format == "ark":  # KALDI archive, line format: "key text"
                    cols = line.strip().split(maxsplit=1)
                    # A blank line gives no columns at all; it is counted but
                    # skipped instead of raising IndexError on cols[0].
                    if cols:
                        key = cols[0]
                        text = cols[1] if len(cols) == 2 else ""
                else:
                    text = line.strip()

                if text:
                    text = normalizer(text)

                if text:
                    if args.format == "ark":
                        print(key + "\t" + text, file=ostream)
                    else:
                        print(text, file=ostream)

                ndone += 1
                if ndone % args.log_interval == 0:
                    print(f"text norm: {ndone} lines done.", file=sys.stderr, flush=True)
    print(f"text norm: {ndone} lines done in total.", file=sys.stderr, flush=True)
# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2
class Qwen2RotaryEmbedding(nn.Module):
    """Cached cos/sin tables for rotary position embeddings.

    Args:
        dim: rotary dimension; the tables have ``dim`` columns.
        max_position_embeddings: number of positions cached up front.
        base: base of the inverse-frequency geometric progression.
        device: device on which the inverse frequencies are created.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the cos/sin buffers for positions ``0 .. seq_len - 1``."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)

        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """Return the ``(cos, sin)`` tables for the first ``seq_len`` positions.

        Args:
            x: tensor whose dtype (and device, on cache growth) is used;
                documented upstream as [bs, num_attention_heads, seq_len, head_size].
            seq_len: number of positions wanted. When omitted it is taken
                from ``x.shape[-2]``.

        Returns:
            Two tensors of shape ``(seq_len, dim)`` cast to ``x.dtype``.
        """
        # The default used to be compared directly (`None > int`), which
        # raised TypeError; fall back to the sequence axis of `x` instead.
        if seq_len is None:
            seq_len = x.shape[-2]

        # Grow the cache on demand when a longer sequence shows up.
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )
class PatchEmbed(nn.Module):
    """Embed a 2D latent into a sequence of patch tokens.

    A strided convolution cuts the input into non-overlapping patches and
    lifts them to ``in_channels * 256`` channels; a group norm and a 1x1
    convolution then project every patch to ``embed_dim``.
    """

    def __init__(
        self,
        height=16,
        width=4096,
        patch_size=(16, 1),
        in_channels=8,
        embed_dim=1152,
        bias=True,
    ):
        super().__init__()
        patch_h, patch_w = patch_size
        hidden_channels = in_channels * 256

        # Layers are created in execution order so parameter initialisation
        # consumes the RNG in the same sequence as before.
        patchify = nn.Conv2d(
            in_channels, hidden_channels, kernel_size=patch_size, stride=patch_size, padding=0, bias=bias
        )
        norm = torch.nn.GroupNorm(num_groups=32, num_channels=hidden_channels, eps=1e-6, affine=True)
        project = nn.Conv2d(hidden_channels, embed_dim, kernel_size=1, stride=1, padding=0, bias=bias)
        self.early_conv_layers = nn.Sequential(patchify, norm, project)

        self.patch_size = patch_size
        # Grid size measured in patches.
        self.height = height // patch_h
        self.width = width // patch_w
        self.base_size = self.width

    def forward(self, latent):
        """Map ``N x C x H x W`` to ``N x (H/ph * W/pw) x embed_dim``."""
        features = self.early_conv_layers(latent)
        # BCHW -> BNC: flatten the patch grid, then move channels last.
        return features.flatten(2).transpose(1, 2)
# Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
    """
    Enable chunked feed-forward on every sub-module that supports it
    (see https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

    Every descendant exposing ``set_chunk_feed_forward`` is visited in
    depth-first, parent-before-children order and configured with the given
    ``chunk_size`` and ``dim``.

    Parameters:
        chunk_size (`int`, *optional*):
            Chunk size for the feed-forward layers. A missing or falsy value is replaced by 1.
        dim (`int`, *optional*, defaults to `0`):
            Dimension to chunk over: 0 (batch) or 1 (sequence length).

    Raises:
        ValueError: if `dim` is neither 0 nor 1.
    """
    if dim not in (0, 1):
        raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")

    effective_chunk = chunk_size if chunk_size else 1

    # Explicit stack instead of recursion; children are pushed reversed so
    # they are popped in their declared order.
    pending = list(self.children())[::-1]
    while pending:
        current = pending.pop()
        if hasattr(current, "set_chunk_feed_forward"):
            current.set_chunk_feed_forward(chunk_size=effective_chunk, dim=dim)
        pending.extend(reversed(list(current.children())))
def decode(
    self,
    hidden_states: torch.Tensor,
    attention_mask: torch.Tensor,
    encoder_hidden_states: torch.Tensor,
    encoder_hidden_mask: torch.Tensor,
    timestep: Optional[torch.Tensor],
    ssl_hidden_states: Optional[List[torch.Tensor]] = None,
    output_length: int = 0,
    block_controlnet_hidden_states: Optional[Union[List[torch.Tensor], torch.Tensor]] = None,
    controlnet_scale: Union[float, torch.Tensor] = 1.0,
    return_dict: bool = True,
):
    """Run the transformer stack on the latents and project back to latent space.

    Args:
        hidden_states: Latent input; it is patch-embedded by ``self.proj_in``.
        attention_mask: Self-attention mask handed to every block.
        encoder_hidden_states: Conditioning sequence for cross attention.
        encoder_hidden_mask: Mask for ``encoder_hidden_states``.
        timestep: Diffusion timestep(s), embedded via ``time_proj`` and
            ``timestep_embedder``.
        ssl_hidden_states: Optional SSL target features, one entry per
            projector; ``None`` entries are skipped. When given, a cosine
            projection loss is computed against intermediate block outputs.
        output_length: Target width forwarded to the final layer.
        block_controlnet_hidden_states: Optional control signal, mixed in
            through ``cross_norm`` and scaled by ``controlnet_scale``.
        controlnet_scale: Multiplier for the control signal.
        return_dict: Return a ``Transformer1DModelOutput`` when true,
            otherwise a ``(output, proj_losses)`` tuple.

    Returns:
        The final-layer output together with ``proj_losses``, a list of
        ``(ssl_name, loss)`` pairs (empty when no SSL targets are supplied).
    """
    embedded_timestep = self.timestep_embedder(self.time_proj(timestep).to(dtype=hidden_states.dtype))
    temb = self.t_block(embedded_timestep)

    hidden_states = self.proj_in(hidden_states)

    # controlnet logic
    if block_controlnet_hidden_states is not None:
        control_condi = cross_norm(hidden_states, block_controlnet_hidden_states)
        hidden_states = hidden_states + control_condi * controlnet_scale

    inner_hidden_states = []

    rotary_freqs_cis = self.rotary_emb(hidden_states, seq_len=hidden_states.shape[1])
    encoder_rotary_freqs_cis = self.rotary_emb(encoder_hidden_states, seq_len=encoder_hidden_states.shape[1])

    use_checkpointing = self.training and self.gradient_checkpointing
    if use_checkpointing:

        def create_custom_forward(module, return_dict=None):
            # The block inputs are passed to ``checkpoint`` as keyword
            # arguments, and non-reentrant checkpointing forwards those to
            # the wrapped callable, so the wrapper must accept **kwargs.
            def custom_forward(*inputs, **kwargs):
                if return_dict is not None:
                    return module(*inputs, return_dict=return_dict, **kwargs)
                return module(*inputs, **kwargs)

            return custom_forward

        ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}

    for index_block, block in enumerate(self.transformer_blocks):

        if use_checkpointing:
            hidden_states = torch.utils.checkpoint.checkpoint(
                create_custom_forward(block),
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_hidden_mask,
                rotary_freqs_cis=rotary_freqs_cis,
                rotary_freqs_cis_cross=encoder_rotary_freqs_cis,
                temb=temb,
                **ckpt_kwargs,
            )

        else:
            hidden_states = block(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_hidden_mask,
                rotary_freqs_cis=rotary_freqs_cis,
                rotary_freqs_cis_cross=encoder_rotary_freqs_cis,
                temb=temb,
            )

        # Collect this block's output once per matching entry in
        # ``ssl_encoder_depths``; a depth listed twice is collected twice so
        # the collected states line up one-to-one with ``self.projectors``.
        for ssl_encoder_depth in self.ssl_encoder_depths:
            if index_block == ssl_encoder_depth:
                inner_hidden_states.append(hidden_states)

    proj_losses = []
    if len(inner_hidden_states) > 0 and ssl_hidden_states is not None and len(ssl_hidden_states) > 0:

        for inner_hidden_state, projector, ssl_hidden_state, ssl_name in zip(inner_hidden_states, self.projectors, ssl_hidden_states, self.ssl_names):
            if ssl_hidden_state is None:
                continue
            # 1. map the block output to the SSL feature width
            est_ssl_hidden_state = projector(inner_hidden_state)
            # 3. projection loss, accumulated per sample and averaged over the batch
            bs = inner_hidden_state.shape[0]
            proj_loss = 0.0
            for i, (z, z_tilde) in enumerate(zip(ssl_hidden_state, est_ssl_hidden_state)):
                # 2. linearly resample the estimate along its first axis to len(z) frames
                z_tilde = F.interpolate(z_tilde.unsqueeze(0).transpose(1, 2), size=len(z), mode='linear', align_corners=False).transpose(1, 2).squeeze(0)

                z_tilde = torch.nn.functional.normalize(z_tilde, dim=-1)
                z = torch.nn.functional.normalize(z, dim=-1)
                # a target of +1 for every frame asks the loss to maximise cosine similarity
                target = torch.ones(z.shape[0], device=z.device)
                proj_loss += self.cosine_loss(z, z_tilde, target)
            proj_losses.append((ssl_name, proj_loss / bs))

    output = self.final_layer(hidden_states, embedded_timestep, output_length)
    if not return_dict:
        return (output, proj_losses)

    return Transformer1DModelOutput(sample=output, proj_losses=proj_losses)
class Balancer:
    """
    Re-weights several losses on the fly using the norms of their gradients.

    For each loss the gradient with respect to a chosen tensor is taken, its
    per-sample L2 norm is averaged over the batch, and that value is divided
    by an exponential moving average of itself. The normalized values,
    rescaled to sum to one, become the dynamic weights.

    Args:
        weights (dict): Static weight for each loss name.
                        Example: {"mse_loss": 1.0, "adv_loss": 1.0}
        ema_decay (float): Decay of the exponential moving average (default: 0.99).
        epsilon (float): Added to the EMA before dividing (default: 1e-8).
    """
    def __init__(self, weights, ema_decay=0.99, epsilon=1e-8):
        self.weights = weights
        self.ema_decay = ema_decay
        self.epsilon = epsilon
        # One EMA slot per loss; 0.0 doubles as the "not initialised yet" marker.
        self.ema_values = dict.fromkeys(weights, 0.0)

    def forward(self, losses, grad_inputs):
        """
        Combine ``losses`` into one scalar using gradient-norm based weights.

        Args:
            losses (dict): Loss tensors keyed by name.
                           Example: {"mse_loss": mse_loss, "adv_loss": adv_loss}
            grad_inputs (dict): For each loss name, the tensor the gradient is taken with respect to.
                                Example: {"mse_loss": recon_mels, "adv_loss": recon_mels}

        Returns:
            tuple: ``(combined_loss, log_weights)`` where ``log_weights`` maps
            ``"<name>_weight"`` to the dynamic weight of each loss.

        Raises:
            ValueError: if the two dictionaries do not have the same keys.
        """
        if losses.keys() != grad_inputs.keys():
            raise ValueError("Keys of losses and grad_inputs must match.")

        norm_values = {}
        for name, loss in losses.items():
            (loss_grad,) = grad(loss.mean(), [grad_inputs[name]], create_graph=True)
            non_batch_dims = tuple(range(1, loss_grad.ndim))
            grad_norm = torch.linalg.vector_norm(loss_grad, ord=2, dim=non_batch_dims).mean()

            previous = self.ema_values[name]
            current = grad_norm.item()
            if previous == 0.0:
                # First observation seeds the average.
                self.ema_values[name] = current
            else:
                self.ema_values[name] = previous * self.ema_decay + current * (1 - self.ema_decay)

            norm_values[name] = grad_norm / (self.ema_values[name] + self.epsilon)

        total_norm = sum(norm_values.values())
        dynamic_weights = {name: value / total_norm for name, value in norm_values.items()}

        loss = 0.0
        for name, value in losses.items():
            loss = loss + self.weights[name] * dynamic_weights[name] * value
        log_weights = {f"{name}_weight": dynamic_weights[name] for name in losses}
        return loss, log_weights
print("Dynamic Weights:", log_weights) diff --git a/music_dcae/config_f8c8.json b/music_dcae/config_f8c8.json new file mode 100644 index 0000000..b8febd2 --- /dev/null +++ b/music_dcae/config_f8c8.json @@ -0,0 +1,69 @@ +{ + "_class_name": "AutoencoderDC", + "_diffusers_version": "0.32.1", + "_name_or_path": "mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers", + "attention_head_dim": 32, + "decoder_act_fns": "silu", + "decoder_block_out_channels": [ + 128, + 256, + 512, + 1024 + ], + "decoder_block_types": [ + "ResBlock", + "ResBlock", + "ResBlock", + "EfficientViTBlock" + ], + "decoder_layers_per_block": [ + 3, + 3, + 3, + 3 + ], + "decoder_norm_types": "rms_norm", + "decoder_qkv_multiscales": [ + [], + [], + [ + 5 + ], + [ + 5 + ] + ], + "downsample_block_type": "Conv", + "encoder_block_out_channels": [ + 128, + 256, + 512, + 1024 + ], + "encoder_block_types": [ + "ResBlock", + "ResBlock", + "ResBlock", + "EfficientViTBlock" + ], + "encoder_layers_per_block": [ + 2, + 2, + 3, + 3 + ], + "encoder_qkv_multiscales": [ + [], + [], + [ + 5 + ], + [ + 5 + ] + ], + "in_channels": 2, + "latent_channels": 8, + "scaling_factor": 0.41407, + "upsample_block_type": "interpolate" +} diff --git a/music_dcae/distrib.py b/music_dcae/distrib.py new file mode 100644 index 0000000..60b0a82 --- /dev/null +++ b/music_dcae/distrib.py @@ -0,0 +1,124 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
def sync_buffer(buffers, average=True):
    """
    Synchronise floating-point buffers across workers.

    With ``average=True`` every floating-point buffer is summed over all
    workers and divided by the number of workers. With ``average=False`` the
    buffers are instead broadcast from rank 0. Non floating-point buffers are
    left untouched. Does nothing when not running distributed.
    """
    if not is_distributed():
        return
    handles = []
    for buffer in buffers:
        if torch.is_floating_point(buffer.data):
            if average:
                handle = torch.distributed.all_reduce(
                    buffer.data, op=torch.distributed.ReduceOp.SUM, async_op=True)
            else:
                handle = torch.distributed.broadcast(
                    buffer.data, src=0, async_op=True)
            handles.append((buffer, handle))
    # ``world_size`` is a function: it must be called. Dividing by the bare
    # name would raise a TypeError as soon as averaging was attempted.
    num_workers = world_size()
    for buffer, handle in handles:
        handle.wait()
        if average:
            buffer.data /= num_workers
class MusicDCAE(nn.Module):
    """``nn.Module`` wrapper around an ``AutoencoderDC`` built from a JSON config file."""

    def __init__(self, config_path=DEFAULT_CONFIG_PATH):
        super().__init__()
        # The JSON object is passed verbatim as keyword arguments.
        with open(config_path) as config_file:
            settings = json.load(config_file)
        self.dcae = AutoencoderDC(**settings)

    def encode(self, x):
        """Return the ``latent`` field of the autoencoder's encode output."""
        encoded = self.dcae.encode(x)
        return encoded.latent

    def decode(self, latent):
        """Return the ``sample`` field of the autoencoder's decode output."""
        decoded = self.dcae.decode(latent)
        return decoded.sample

    def forward(self, x):
        """Run the full autoencoder and return the ``sample`` field of its output."""
        result = self.dcae(x)
        return result.sample

    def return_middle_layers(self):
        """Return the modules on either side of the bottleneck.

        In order: the last encoder down block, the encoder output conv, the
        decoder input conv and the first decoder up block.
        """
        encoder = self.dcae.encoder
        decoder = self.dcae.decoder
        return [
            encoder.down_blocks[-1],
            encoder.conv_out,
            decoder.conv_in,
            decoder.up_blocks[0],
        ]

    def return_head_layers(self):
        """Return the last decoder up block and the decoder output conv."""
        decoder = self.dcae.decoder
        return [decoder.up_blocks[-1], decoder.conv_out]
== 2: + # mask = mask.unsqueeze(1).unsqueeze(1).float() + # mask = mask.repeat(1, 1, x.shape[2], 1) + latent = model.encode(x) + print("latent shape: ", latent.shape) + y = model(x) + print("y", y.shape) + total_params = sum(p.numel() for p in model.parameters()) + print(f"模型参数总数: {total_params / 1e6:.2f}M") + + # middle_layers = model.return_middle_layers() + # middle_params_count = 0 + # for layer in middle_layers: + # for name, param in layer.named_parameters(): + # layer_param_count = param.numel() + # middle_params_count += layer_param_count + # print(f"{name}: {param.shape}, 参数量: {layer_param_count/1e6:.2f}M") + + # print(f"中间层总参数量: {middle_params_count/1e6:.2f}M") + + # head_layers = model.return_head_layers() + # head_params_count = 0 + # for layer in head_layers: + # for name, param in layer.named_parameters(): + # layer_param_count = param.numel() + # head_params_count += layer_param_count + # print(f"{name}: {param.shape}, 参数量: {layer_param_count/1e6:.2f}M") + + # print(f"头部层总参数量: {head_params_count/1e6:.2f}M") \ No newline at end of file diff --git a/music_dcae/music_dcae_pipeline.py b/music_dcae/music_dcae_pipeline.py new file mode 100644 index 0000000..a3fd7db --- /dev/null +++ b/music_dcae/music_dcae_pipeline.py @@ -0,0 +1,155 @@ +import torch +import torch.nn as nn +from diffusers import AutoencoderDC +import torchaudio +import torchvision.transforms as transforms +import torchaudio + +try: + from .music_log_mel import get_mel_transform + from .music_vocoder import ADaMoSHiFiGANV1 +except ImportError: + from music_log_mel import get_mel_transform + from music_vocoder import ADaMoSHiFiGANV1 + +import os + +root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +DEFAULT_PRETRAINED_PATH = os.path.join(root_dir, "checkpoints", "music_dcae_f8c8") +VOCODER_PRETRAINED_PATH = os.path.join(root_dir, "checkpoints", "music_vocoder.pt") + + +class MusicDCAE(nn.Module): + def __init__(self, pretrained_path=DEFAULT_PRETRAINED_PATH, 
encoder_only=False, source_sample_rate=None): + super(MusicDCAE, self).__init__() + dcae = AutoencoderDC.from_pretrained(pretrained_path) + self.encoder_only = encoder_only + + self.mel_transform = get_mel_transform() + if encoder_only: + self.encoder = dcae.encoder + else: + self.encoder = dcae.encoder + self.decoder = dcae.decoder + self.vocoder = ADaMoSHiFiGANV1(VOCODER_PRETRAINED_PATH).eval() + + if source_sample_rate is None: + source_sample_rate = 48000 + + self.resampler = torchaudio.transforms.Resample(source_sample_rate, 44100) + + self.transform = transforms.Compose([ + transforms.Normalize(0.5, 0.5), + ]) + self.min_mel_value = -11.0 + self.max_mel_value = 3.0 + self.audio_chunk_size = int(round((1024 * 512 / 44100 * 48000))) + self.mel_chunk_size = 1024 + self.time_dimention_multiple = 8 + self.latent_chunk_size = self.mel_chunk_size // self.time_dimention_multiple + self.scale_factor = 0.1786 + self.shift_factor = -1.9091 + + def load_audio(self, audio_path): + audio, sr = torchaudio.load(audio_path) + return audio, sr + + def forward_mel(self, audios): + mels = [] + for i in range(len(audios)): + image = self.mel_transform(audios[i]) + mels.append(image) + mels = torch.stack(mels) + return mels + + @torch.no_grad() + def encode(self, audios, audio_lengths=None, sr=None): + if audio_lengths is None: + audio_lengths = torch.tensor([audios.shape[2]] * audios.shape[0]) + audio_lengths = audio_lengths.to(audios.device) + + # audios: N x 2 x T, 48kHz + device = audios.device + dtype = audios.dtype + + if sr is None: + sr = 48000 + resampler = self.resampler + else: + resampler = torchaudio.transforms.Resample(sr, 44100).to(device).to(dtype) + + audio = resampler(audios) + + max_audio_len = audio.shape[-1] + if max_audio_len % (8 * 512) != 0: + audio = torch.nn.functional.pad(audio, (0, 8 * 512 - max_audio_len % (8 * 512))) + + mels = self.forward_mel(audio) + mels = (mels - self.min_mel_value) / (self.max_mel_value - self.min_mel_value) + mels = 
@torch.no_grad()
def decode(self, latents, audio_lengths=None, sr=None):
    """Turn latents back into waveforms.

    The latents are un-scaled, decoded to mel spectrograms one item at a
    time, mapped from the decoder's output range back to
    ``[min_mel_value, max_mel_value]``, and passed through the vocoder.

    Args:
        latents: Batch of latents as produced by ``encode``.
        audio_lengths: Optional per-item lengths; when given each waveform is
            cut to that many samples along its last dim and moved to CPU.
        sr: Optional output sample rate. When given, the waveforms are
            resampled from 44100 to ``sr``.

    Returns:
        ``(sr, pred_wavs)`` where ``sr`` is the requested rate, or 44100 when
        none was requested. ``pred_wavs`` is a stacked tensor when neither
        resampling nor trimming happened, otherwise a list of tensors.
    """
    latents = latents / self.scale_factor + self.shift_factor

    # Decode item by item, then re-assemble the batch.
    mels = torch.cat([self.decoder(latent.unsqueeze(0)) for latent in latents], dim=0)

    # Undo the (0.5, 0.5) normalisation, then the min/max scaling.
    mels = mels * 0.5 + 0.5
    mels = mels * (self.max_mel_value - self.min_mel_value) + self.min_mel_value

    batch_size, _, _, _ = mels.shape
    pred_wavs = torch.stack(
        [self.vocoder.decode(mels[index]).squeeze(1) for index in range(batch_size)]
    )

    if sr is None:
        sr = 44100
    else:
        resampler = torchaudio.transforms.Resample(44100, sr).to(latents.device).to(latents.dtype)
        pred_wavs = [resampler(wav) for wav in pred_wavs]

    if audio_lengths is not None:
        pred_wavs = [wav[:, :length].cpu() for wav, length in zip(pred_wavs, audio_lengths)]
    return sr, pred_wavs
latents, latent_lengths = model(audios, audio_lengths, sr) + print("reconstructed wavs: ", pred_wavs[0].shape) + print("latents shape: ", latents.shape) + print("latent_lengths: ", latent_lengths) + print("sr: ", sr) + torchaudio.save("/root/data/repo/gongjunmin/sag_train/reconstructed.wav", pred_wavs[0], sr) + print("reconstructed wav saved to /root/data/repo/gongjunmin/sag_train/reconstructed.wav") diff --git a/music_dcae/music_dcae_refiner.py b/music_dcae/music_dcae_refiner.py new file mode 100644 index 0000000..79eb617 --- /dev/null +++ b/music_dcae/music_dcae_refiner.py @@ -0,0 +1,551 @@ +from typing import Tuple, Union, Optional, Dict, Any +import torch +import torch.nn as nn +from diffusers.models.autoencoders.autoencoder_dc import DCUpBlock2d, get_block, RMSNorm, Decoder +from diffusers.models.transformers.sana_transformer import SanaTransformerBlock +from diffusers.models.embeddings import get_2d_sincos_pos_embed +from diffusers.models.normalization import AdaLayerNormSingle, RMSNorm +from diffusers.models.modeling_utils import ModelMixin +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.models.attention_processor import AttentionProcessor +from diffusers.models.modeling_outputs import Transformer2DModelOutput +from diffusers.utils import is_torch_version +from diffusers.models.unets import UNet2DModel + + +class Encoder(nn.Module): + + def __init__( + self, + in_channels: int = 32, + out_channels: int = 8, + attention_head_dim: int = 32, + block_out_channels: Tuple[int] = (512, 1024, 2048), + layers_per_block: Tuple[int] = (3, 3, 3), + block_type: str = "EfficientViTBlock", + norm_type: str = "rms_norm", + act_fn: str = "silu", + qkv_multiscales: tuple = (5,), + ): + super(Encoder, self).__init__() + + num_blocks = len(block_out_channels) + + self.dump_encoder = False + if num_blocks == 0: + self.dump_encoder = True + return + + self.conv_in = nn.Conv2d(in_channels, block_out_channels[-1], kernel_size=3, stride=1, 
class PatchEmbed(nn.Module):
    """
    2D latent to patch embedding with rectangular patches and support for SD3 cropping.

    Args:
        height (`int`, defaults to `16`): The height of the input.
        width (`int`, defaults to `128`): The width of the input.
        patch_size (`Tuple[int, int]`, defaults to `(16, 1)`): Patch size as `(patch_height, patch_width)`.
        in_channels (`int`, defaults to `16`): The number of input channels.
        embed_dim (`int`, defaults to `768`): The output dimension of the embedding.
        layer_norm (`bool`, defaults to `False`): Whether or not to use layer normalization.
        flatten (`bool`, defaults to `True`): Whether or not to flatten the output.
        bias (`bool`, defaults to `True`): Whether or not to use bias.
        interpolation_scale (`float`, defaults to `1`): The scale of the interpolation.
        pos_embed_type (`str`, defaults to `"sincos"`): The type of positional embedding (`"sincos"` or `None`).
        pos_embed_max_size (`int`, defaults to `None`): The maximum size of the positional embedding.
    """

    def __init__(
        self,
        height=16,
        width=128,
        patch_size=(16, 1),
        in_channels=16,
        embed_dim=768,
        layer_norm=False,
        flatten=True,
        bias=True,
        interpolation_scale=1,
        pos_embed_type="sincos",
        pos_embed_max_size=None,  # For SD3 cropping
    ):
        super().__init__()

        self.flatten = flatten
        self.layer_norm = layer_norm
        self.pos_embed_max_size = pos_embed_max_size

        self.proj = nn.Conv2d(
            in_channels, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias
        )
        if layer_norm:
            self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
        else:
            self.norm = None

        self.patch_size = patch_size
        self.height, self.width = height // patch_size[0], width // patch_size[1]
        # NOTE(review): this divides the height by the patch *width*; kept
        # as-is because changing it would alter the positional embeddings
        # existing checkpoints were trained with -- confirm intent.
        self.base_size = height // patch_size[1]
        self.interpolation_scale = interpolation_scale

        # Grid for the cached positional embeddings. Without a max size the
        # grid must be the actual (rows, cols) patch layout: a square
        # int(sqrt(num_patches)) grid yields the wrong token count for
        # non-square layouts (e.g. 1 x 128 -> 11 x 11 = 121 tokens) and the
        # addition in ``forward`` then fails.
        if pos_embed_max_size:
            grid_size = pos_embed_max_size
        else:
            grid_size = (self.height, self.width)

        if pos_embed_type is None:
            self.pos_embed = None
        elif pos_embed_type == "sincos":
            pos_embed = get_2d_sincos_pos_embed(
                embed_dim,
                grid_size,
                base_size=self.base_size,
                interpolation_scale=self.interpolation_scale,
                output_type="pt",
            )
            # Only the max-size table is stored in the state dict.
            persistent = bool(pos_embed_max_size)
            self.register_buffer("pos_embed", pos_embed.float().unsqueeze(0), persistent=persistent)
        else:
            raise ValueError(f"Unsupported pos_embed_type: {pos_embed_type}")

    def cropped_pos_embed(self, height, width):
        """Crops positional embeddings for SD3 compatibility.

        `height` and `width` are input sizes; they are converted to patch
        counts and a centred window of that size is cut out of the
        `pos_embed_max_size` x `pos_embed_max_size` table.
        """
        if self.pos_embed_max_size is None:
            raise ValueError("`pos_embed_max_size` must be set for cropping.")

        # ``patch_size`` is a (height, width) pair, so each axis is divided
        # by its own patch extent.
        height = height // self.patch_size[0]
        width = width // self.patch_size[1]
        if height > self.pos_embed_max_size:
            raise ValueError(
                f"Height ({height}) cannot be greater than `pos_embed_max_size`: {self.pos_embed_max_size}."
            )
        if width > self.pos_embed_max_size:
            raise ValueError(
                f"Width ({width}) cannot be greater than `pos_embed_max_size`: {self.pos_embed_max_size}."
            )

        top = (self.pos_embed_max_size - height) // 2
        left = (self.pos_embed_max_size - width) // 2
        spatial_pos_embed = self.pos_embed.reshape(1, self.pos_embed_max_size, self.pos_embed_max_size, -1)
        spatial_pos_embed = spatial_pos_embed[:, top : top + height, left : left + width, :]
        spatial_pos_embed = spatial_pos_embed.reshape(1, -1, spatial_pos_embed.shape[-1])
        return spatial_pos_embed

    def forward(self, latent):
        """Embed `latent` into patch tokens and add positional embeddings when configured."""
        if self.pos_embed_max_size is not None:
            # Input sizes; ``cropped_pos_embed`` converts them to patch counts.
            height, width = latent.shape[-2:]
        else:
            height, width = latent.shape[-2] // self.patch_size[0], latent.shape[-1] // self.patch_size[1]
        latent = self.proj(latent)
        if self.flatten:
            latent = latent.flatten(2).transpose(1, 2)  # BCHW -> BNC
        if self.layer_norm:
            latent = self.norm(latent)
        if self.pos_embed is None:
            return latent.to(latent.dtype)
        # Interpolate or crop positional embeddings as needed
        if self.pos_embed_max_size:
            pos_embed = self.cropped_pos_embed(height, width)
        else:
            if self.height != height or self.width != width:
                # Input differs from the configured size: rebuild the table for this grid.
                pos_embed = get_2d_sincos_pos_embed(
                    embed_dim=self.pos_embed.shape[-1],
                    grid_size=(height, width),
                    base_size=self.base_size,
                    interpolation_scale=self.interpolation_scale,
                    device=latent.device,
                    output_type="pt",
                )
                pos_embed = pos_embed.float().unsqueeze(0)
            else:
                pos_embed = self.pos_embed

        # The table is float32; cast the sum back to the input dtype.
        return (latent + pos_embed).to(latent.dtype)
_supports_gradient_checkpointing = True + @register_to_config + + def __init__( + self, + sample_size: Tuple[int, int] = (16, 128), + in_channels: int = 16, + out_channels: int = 8, + patch_size: Tuple[int, int] = (16, 1), + inner_dim: int = 1152, + num_attention_heads: int = 36, + attention_head_dim: int = 32, + dropout: float = 0.0, + cross_attention_dim: Optional[int] = None, + num_cross_attention_heads: Optional[int] = None, + cross_attention_head_dim: Optional[int] = None, + attention_bias: bool = False, + norm_elementwise_affine: bool = False, + norm_eps: float = 1e-6, + interpolation_scale: int = 1, + mlp_ratio: float = 2.5, + num_layers: int = 12, + ): + super(DiTDecoder, self).__init__() + interpolation_scale = interpolation_scale if interpolation_scale is not None else max(sample_size // 64, 1) + self.interpolation_scale = interpolation_scale + + self.patch_embed = PatchEmbed( + height=sample_size[0], + width=sample_size[1], + patch_size=patch_size, + in_channels=in_channels, + embed_dim=inner_dim, + interpolation_scale=interpolation_scale, + ) + + self.time_embed = AdaLayerNormSingle(inner_dim) + + self.transformer_blocks = nn.ModuleList( + [ + SanaTransformerBlock( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + num_cross_attention_heads=num_cross_attention_heads, + cross_attention_head_dim=cross_attention_head_dim, + cross_attention_dim=cross_attention_dim, + attention_bias=attention_bias, + norm_elementwise_affine=norm_elementwise_affine, + norm_eps=norm_eps, + mlp_ratio=mlp_ratio, + ) + for _ in range(num_layers) + ] + ) + + self.scale_shift_table = nn.Parameter(torch.randn(2, inner_dim) / inner_dim ** 0.5) + self.norm_out = nn.LayerNorm(inner_dim, eps=1e-6, elementwise_affine=False) + self.proj_out = nn.Linear(inner_dim, patch_size[0] * patch_size[1] * out_channels) + self.gradient_checkpointing = False + + def _set_gradient_checkpointing(self, module, value=False): + if hasattr(module, "gradient_checkpointing"): + 
module.gradient_checkpointing = value + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor() + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
+ ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + def forward( + self, + hidden_states: torch.Tensor, + timestep: Optional[int] = None, + return_dict: bool = True, + ): + + # 1. Input + batch_size, num_channels, height, width = hidden_states.shape + patch_size = self.config.patch_size + + post_patch_height, post_patch_width = height // patch_size[0], width // patch_size[1] + + hidden_states = self.patch_embed(hidden_states) + + timestep, embedded_timestep = self.time_embed( + timestep, batch_size=batch_size, hidden_dtype=hidden_states.dtype + ) + + # 2. Transformer blocks + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + + for block in self.transformer_blocks: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + None, + None, + None, + timestep, + post_patch_height, + post_patch_width, + **ckpt_kwargs, + ) + + else: + for block in self.transformer_blocks: + hidden_states = block( + hidden_states, + None, + None, + None, + timestep, + post_patch_height, + post_patch_width, + ) + + # 3. Normalization + shift, scale = ( + self.scale_shift_table[None] + embedded_timestep[:, None].to(self.scale_shift_table.device) + ).chunk(2, dim=1) + + # 4. 
Modulation + hidden_states = hidden_states * (1 + scale) + shift + hidden_states = self.proj_out(hidden_states) + + # 5. Unpatchify + hidden_states = hidden_states.reshape( + batch_size, post_patch_height, post_patch_width, self.config.patch_size[0], self.config.patch_size[1], -1 + ) + hidden_states = hidden_states.permute(0, 5, 1, 3, 2, 4) + output = hidden_states.reshape(batch_size, -1, post_patch_height * patch_size[0], post_patch_width * patch_size[1]) + + if not return_dict: + return (output,) + + return Transformer2DModelOutput(sample=output) + + +class MusicDcaeRefiner(ModelMixin, ConfigMixin): + + @register_to_config + def __init__( + self, + in_channels: int = 32, + attention_head_dim: int = 32, + block_out_channels: Tuple[int] = (512, 1024, 2048), + layers_per_block: Tuple[int] = (3, 3, 3), + conv_block_out_channels: Tuple[int] = (224, 448, 672, 896), + out_channels: int = 8, + block_type: str = "EfficientViTBlock", + norm_type: str = "rms_norm", + act_fn: str = "silu", + qkv_multiscales: tuple = (5,), + sample_size: Tuple[int, int] = (16, 128), + patch_size: Tuple[int, int] = (16, 1), + inner_dim: int = 1152, + num_attention_heads: int = 36, + dropout: float = 0.0, + cross_attention_dim: Optional[int] = None, + num_cross_attention_heads: Optional[int] = None, + cross_attention_head_dim: Optional[int] = None, + attention_bias: bool = False, + norm_elementwise_affine: bool = False, + norm_eps: float = 1e-6, + interpolation_scale: int = 1, + mlp_ratio: float = 2.5, + num_layers: int = 12, + decoder_type: str = "ConvDecoder", + + ): + super(MusicDcaeRefiner, self).__init__() + + self.encoder = Encoder( + in_channels=in_channels, + out_channels=out_channels, + attention_head_dim=attention_head_dim, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + block_type=block_type, + norm_type=norm_type, + act_fn=act_fn, + qkv_multiscales=qkv_multiscales, + ) + if decoder_type == "DiTDecoder": + self.decoder = DiTDecoder( + 
sample_size=sample_size, + in_channels=out_channels * 2, + out_channels=out_channels, + patch_size=patch_size, + inner_dim=inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + num_cross_attention_heads=num_cross_attention_heads, + cross_attention_head_dim=cross_attention_head_dim, + attention_bias=attention_bias, + norm_elementwise_affine=norm_elementwise_affine, + norm_eps=norm_eps, + interpolation_scale=interpolation_scale, + mlp_ratio=mlp_ratio, + num_layers=num_layers, + ) + else: + self.decoder = UNet2DModel( + sample_size=sample_size, + in_channels=out_channels * 2, + out_channels=out_channels, + block_out_channels=conv_block_out_channels, + ) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + timestep: Optional[int] = None, + return_dict: bool = True + ): + encoder_hidden_states = self.encoder(encoder_hidden_states) + hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1) + output = self.decoder(hidden_states, timestep=timestep, return_dict=return_dict) + return output + + + +if __name__ == "__main__": + # f32c32 -> f8c8 + # model = MusicDcaeRefiner() + + # x = torch.randn(1, 8, 16, 128) + # encoder_x = torch.randn(1, 32, 4, 32) + # timestep = 0 + # y = model(x, encoder_x, timestep=timestep) + # print("y", y.sample.shape) + # total_params = sum(p.numel() for p in model.parameters()) + # print(f"模型参数总数: {total_params / 1e6:.2f}M") + + # # 分别计算encoder和decoder的参数量 + # encoder_params_count = sum(p.numel() for p in model.encoder.parameters()) + # decoder_params_count = sum(p.numel() for p in model.decoder.parameters()) + # print(f"encoder参数量: {encoder_params_count/1e6:.2f}M") + # print(f"decoder参数量: {decoder_params_count/1e6:.2f}M") + + + # f8c8 -> mel + import json + with open("music_dcae/config_f8c8_to_mel_refiner.json", "r") as f: + config = json.load(f) + model = MusicDcaeRefiner(**config) + 
+ x = torch.randn(1, 2, 128, 1024) + encoder_x = torch.randn(1, 2, 128, 1024) + timestep = 0 + y = model(x, encoder_x, timestep=timestep) + print("y", y.sample.shape) + total_params = sum(p.numel() for p in model.parameters()) + print(f"模型参数总数: {total_params / 1e6:.2f}M") + + # 分别计算encoder和decoder的参数量 + encoder_params_count = sum(p.numel() for p in model.encoder.parameters()) + decoder_params_count = sum(p.numel() for p in model.decoder.parameters()) + print(f"encoder参数量: {encoder_params_count/1e6:.2f}M") + print(f"decoder参数量: {decoder_params_count/1e6:.2f}M") diff --git a/music_dcae/music_dcae_vocoder.py b/music_dcae/music_dcae_vocoder.py new file mode 100644 index 0000000..e4b1047 --- /dev/null +++ b/music_dcae/music_dcae_vocoder.py @@ -0,0 +1,157 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from diffusers import AutoencoderDC +import json +import torchvision.transforms as transforms +import torchaudio + +try: + from .music_vocoder import ADaMoSHiFiGANV1 +except ImportError: + from music_vocoder import ADaMoSHiFiGANV1 + + +DEFAULT_CONFIG_PATH = "/root/sag_train/music_dcae/config_f32c32_large.json" +DCAE_PRETRAINED_PATH = "/root/sag_train/checkpoints/music_dcae_f32c32" +VOCODER_PRETRAINED_PATH = "/root/sag_train/checkpoints/music_vocoder.pt" + + +class MusicDCAEVocoder(nn.Module): + def __init__(self, config_path=DEFAULT_CONFIG_PATH, pretrained_path=DCAE_PRETRAINED_PATH): + super(MusicDCAEVocoder, self).__init__() + if pretrained_path is None: + with open(config_path) as f: + config = json.load(f) + self.dcae = AutoencoderDC(**config) + else: + self.dcae = AutoencoderDC.from_pretrained(pretrained_path) + self.vocoder = ADaMoSHiFiGANV1(VOCODER_PRETRAINED_PATH) + self.freeze_vocoder() + self.transform = transforms.Compose([ + transforms.Normalize(0.5, 0.5), + ]) + self.min_mel_value = -11.0 + self.max_mel_value = 3.0 + self.target_sr = 44100 + + def load_audio(self, audio_path): + audio, sr = torchaudio.load(audio_path) + if 
audio.shape[0] == 1: + audio = torch.cat([audio, audio], dim=0) + return audio, sr + + def resample_audio(self, audio, sr=48000): + resampler = torchaudio.transforms.Resample(sr, self.target_sr) + resampler = resampler.to(audio.device) + audio = resampler(audio) + return audio + + def forward_mel(self, audios): + mels = [] + for i in range(len(audios)): + image = self.vocoder.mel_transform(audios[i]) + mels.append(image) + mels = torch.stack(mels) + return mels + + def norm_mel(self, mels): + normed_mels = (mels - self.min_mel_value) / (self.max_mel_value - self.min_mel_value) + normed_mels = self.transform(normed_mels) + return normed_mels + + def denorm_mel(self, normed_mels): + mels = normed_mels * 0.5 + 0.5 + mels = mels * (self.max_mel_value - self.min_mel_value) + self.min_mel_value + return mels + + def encode_latent(self, normed_mels): + # N x 2 x 128 x W -> N x C x 128//F x W//F + latent = self.dcae.encode(normed_mels).latent + return latent + + def decode_mel(self, latent): + # N x C x 128//F x W//F -> N x 2 x 128 x W + normed_mels = self.dcae.decode(latent).sample + return normed_mels + + def decode_audio(self, mels): + # mels: N x 2 x 128 x W -> 2N x 128 x W + bs = mels.shape[0] + mono_mels = mels.reshape(-1, 128, mels.shape[-1]) + mono_audios = self.vocoder(mono_mels) + audios = mono_audios.reshape(bs, 2, -1) + return audios + + def encode(self, audios): + mels = self.forward_mel(audios) + normed_mels = self.norm_mel(mels) + latent = self.encode_latent(normed_mels) + return latent, mels + + def decode(self, latent): + recon_normed_mels = self.decode_mel(latent) + recon_mels = self.denorm_mel(recon_normed_mels) + recon_audios = self.decode_audio(recon_mels) + return recon_audios, recon_mels + + def forward(self, audios): + audios_len = audios.shape[-1] + latent, mels = self.encode(audios) + recon_audios, recon_mels = self.decode(latent) + if recon_audios.shape[-1] > audios_len: + recon_audios = recon_audios[:, :, :audios_len] + elif 
recon_audios.shape[-1] < audios_len: + recon_audios = F.pad(recon_audios, (0, audios_len - recon_audios.shape[-1])) + return recon_audios, mels, recon_mels, latent + + def freeze_vocoder(self): + self.vocoder.eval() + self.vocoder.requires_grad_(False) + + def unfreeze_vocoder(self): + self.vocoder.train() + self.vocoder.requires_grad_(True) + + def return_middle_layers(self): + last_down_block = self.dcae.encoder.down_blocks[-1] + encoder_conv_out = self.dcae.encoder.conv_out + decoder_conv_in = self.dcae.decoder.conv_in + decoder_up_blocks = self.dcae.decoder.up_blocks[0] + middle_layers = [last_down_block, encoder_conv_out, decoder_conv_in, decoder_up_blocks] + return middle_layers + + def return_head_layers(self): + decoder_up_blocks = self.dcae.decoder.up_blocks[-1] + conv_out = self.dcae.decoder.conv_out + head_layers = [decoder_up_blocks, conv_out] + return head_layers + + +if __name__ == "__main__": + model = MusicDCAEVocoder() + + audio_path = "/root/sag_train/orig2.wav" + audio, sr = model.load_audio(audio_path) + audio = model.resample_audio(audio, sr) + + model.eval() + model = model.to("cuda:0") + audio = audio.to("cuda:0") + with torch.no_grad(): + audios_len = audio.shape[-1] + min_frame = 512 * 32 + if audios_len % min_frame != 0: + padding = torch.zeros(audio.shape[0], 2, min_frame - audios_len % min_frame).to(audios.device) + audios = torch.cat([audio, padding], dim=-1) + recon_audios, mels, recon_mels, latent = model(audio.unsqueeze(0)) + recon_audios = recon_audios[:, :, :audios_len] + + print("latent shape: ", latent.shape) + print("recon_audios", recon_audios.shape) + print("mels", mels.shape, "min:", mels.min(), "max:", mels.max(), "mean:", mels.mean(), "std:", mels.std()) + print("recon_mels", recon_mels.shape, "min:", recon_mels.min(), "max:", recon_mels.max(), "mean:", recon_mels.mean(), "std:", recon_mels.std()) + total_params = sum(p.numel() for p in model.parameters()) + print(f"模型参数总数: {total_params / 1e6:.2f}M") + + 
torchaudio.save("/root/sag_train/recon2.wav", recon_audios[0].cpu(), 44100) diff --git a/music_dcae/music_log_mel.py b/music_dcae/music_log_mel.py new file mode 100755 index 0000000..dbf9deb --- /dev/null +++ b/music_dcae/music_log_mel.py @@ -0,0 +1,119 @@ +import torch +import torch.nn as nn +from torch import Tensor +from torchaudio.transforms import MelScale + + +class LinearSpectrogram(nn.Module): + def __init__( + self, + n_fft=2048, + win_length=2048, + hop_length=512, + center=False, + mode="pow2_sqrt", + ): + super().__init__() + + self.n_fft = n_fft + self.win_length = win_length + self.hop_length = hop_length + self.center = center + self.mode = mode + + self.register_buffer("window", torch.hann_window(win_length)) + + def forward(self, y: Tensor) -> Tensor: + if y.ndim == 3: + y = y.squeeze(1) + + y = torch.nn.functional.pad( + y.unsqueeze(1), + ( + (self.win_length - self.hop_length) // 2, + (self.win_length - self.hop_length + 1) // 2, + ), + mode="reflect", + ).squeeze(1) + dtype = y.dtype + spec = torch.stft( + y.float(), + self.n_fft, + hop_length=self.hop_length, + win_length=self.win_length, + window=self.window, + center=self.center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + spec = torch.view_as_real(spec) + + if self.mode == "pow2_sqrt": + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + spec = spec.to(dtype) + return spec + + +class LogMelSpectrogram(nn.Module): + def __init__( + self, + sample_rate=44100, + n_fft=2048, + win_length=2048, + hop_length=512, + n_mels=128, + center=False, + f_min=0.0, + f_max=None, + ): + super().__init__() + + self.sample_rate = sample_rate + self.n_fft = n_fft + self.win_length = win_length + self.hop_length = hop_length + self.center = center + self.n_mels = n_mels + self.f_min = f_min + self.f_max = f_max or sample_rate // 2 + + self.spectrogram = LinearSpectrogram(n_fft, win_length, hop_length, center) + self.mel_scale = MelScale( + self.n_mels, + self.sample_rate, 
+ self.f_min, + self.f_max, + self.n_fft // 2 + 1, + "slaney", + "slaney", + ) + + def compress(self, x: Tensor) -> Tensor: + return torch.log(torch.clamp(x, min=1e-5)) + + def decompress(self, x: Tensor) -> Tensor: + return torch.exp(x) + + def forward(self, x: Tensor, return_linear: bool = False) -> Tensor: + linear = self.spectrogram(x) + x = self.mel_scale(linear) + x = self.compress(x) + # print(x.shape) + if return_linear: + return x, self.compress(linear) + + return x + + +def get_mel_transform(): + return LogMelSpectrogram( + sample_rate=44100, + n_fft=2048, + win_length=2048, + hop_length=512, + f_min=40, + f_max=16000, + n_mels=128, + ) diff --git a/music_dcae/music_vocoder.py b/music_dcae/music_vocoder.py new file mode 100755 index 0000000..4234351 --- /dev/null +++ b/music_dcae/music_vocoder.py @@ -0,0 +1,565 @@ +import librosa +import torch +from torch import nn + +from functools import partial +from math import prod +from typing import Callable, Tuple, List + +import numpy as np +import torch.nn.functional as F +from torch.nn import Conv1d +from torch.nn.utils import weight_norm +from torch.nn.utils.parametrize import remove_parametrizations as remove_weight_norm + +try: + from music_log_mel import LogMelSpectrogram +except ImportError: + from .music_log_mel import LogMelSpectrogram + + +def drop_path( + x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True +): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. 
+ + """ # noqa: E501 + + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * ( + x.ndim - 1 + ) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0 and scale_by_keep: + random_tensor.div_(keep_prob) + return x * random_tensor + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" # noqa: E501 + + def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) + + def extra_repr(self): + return f"drop_prob={round(self.drop_prob,3):0.3f}" + + +class LayerNorm(nn.Module): + r"""LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with + shape (batch_size, height, width, channels) while channels_first corresponds to inputs + with shape (batch_size, channels, height, width). 
+ """ # noqa: E501 + + def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.data_format = data_format + if self.data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError + self.normalized_shape = (normalized_shape,) + + def forward(self, x): + if self.data_format == "channels_last": + return F.layer_norm( + x, self.normalized_shape, self.weight, self.bias, self.eps + ) + elif self.data_format == "channels_first": + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None] * x + self.bias[:, None] + return x + + +class ConvNeXtBlock(nn.Module): + r"""ConvNeXt Block. There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + We use (2) as we find it slightly faster in PyTorch + + Args: + dim (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0. + kernel_size (int): Kernel size for depthwise conv. Default: 7. + dilation (int): Dilation for depthwise conv. Default: 1. 
+ """ # noqa: E501 + + def __init__( + self, + dim: int, + drop_path: float = 0.0, + layer_scale_init_value: float = 1e-6, + mlp_ratio: float = 4.0, + kernel_size: int = 7, + dilation: int = 1, + ): + super().__init__() + + self.dwconv = nn.Conv1d( + dim, + dim, + kernel_size=kernel_size, + padding=int(dilation * (kernel_size - 1) / 2), + groups=dim, + ) # depthwise conv + self.norm = LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear( + dim, int(mlp_ratio * dim) + ) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(int(mlp_ratio * dim), dim) + self.gamma = ( + nn.Parameter(layer_scale_init_value * + torch.ones((dim)), requires_grad=True) + if layer_scale_init_value > 0 + else None + ) + self.drop_path = DropPath( + drop_path) if drop_path > 0.0 else nn.Identity() + + def forward(self, x, apply_residual: bool = True): + input = x + + x = self.dwconv(x) + x = x.permute(0, 2, 1) # (N, C, L) -> (N, L, C) + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + + if self.gamma is not None: + x = self.gamma * x + + x = x.permute(0, 2, 1) # (N, L, C) -> (N, C, L) + x = self.drop_path(x) + + if apply_residual: + x = input + x + + return x + + +class ParallelConvNeXtBlock(nn.Module): + def __init__(self, kernel_sizes: List[int], *args, **kwargs): + super().__init__() + self.blocks = nn.ModuleList( + [ + ConvNeXtBlock(kernel_size=kernel_size, *args, **kwargs) + for kernel_size in kernel_sizes + ] + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.stack( + [block(x, apply_residual=False) for block in self.blocks] + [x], + dim=1, + ).sum(dim=1) + + +class ConvNeXtEncoder(nn.Module): + def __init__( + self, + input_channels=3, + depths=[3, 3, 9, 3], + dims=[96, 192, 384, 768], + drop_path_rate=0.0, + layer_scale_init_value=1e-6, + kernel_sizes: Tuple[int] = (7,), + ): + super().__init__() + assert len(depths) == len(dims) + + self.channel_layers = nn.ModuleList() + stem = 
nn.Sequential( + nn.Conv1d( + input_channels, + dims[0], + kernel_size=7, + padding=3, + padding_mode="replicate", + ), + LayerNorm(dims[0], eps=1e-6, data_format="channels_first"), + ) + self.channel_layers.append(stem) + + for i in range(len(depths) - 1): + mid_layer = nn.Sequential( + LayerNorm(dims[i], eps=1e-6, data_format="channels_first"), + nn.Conv1d(dims[i], dims[i + 1], kernel_size=1), + ) + self.channel_layers.append(mid_layer) + + block_fn = ( + partial(ConvNeXtBlock, kernel_size=kernel_sizes[0]) + if len(kernel_sizes) == 1 + else partial(ParallelConvNeXtBlock, kernel_sizes=kernel_sizes) + ) + + self.stages = nn.ModuleList() + drop_path_rates = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] + + cur = 0 + for i in range(len(depths)): + stage = nn.Sequential( + *[ + block_fn( + dim=dims[i], + drop_path=drop_path_rates[cur + j], + layer_scale_init_value=layer_scale_init_value, + ) + for j in range(depths[i]) + ] + ) + self.stages.append(stage) + cur += depths[i] + + self.norm = LayerNorm(dims[-1], eps=1e-6, data_format="channels_first") + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, (nn.Conv1d, nn.Linear)): + nn.init.trunc_normal_(m.weight, std=0.02) + nn.init.constant_(m.bias, 0) + + def forward( + self, + x: torch.Tensor, + ) -> torch.Tensor: + for channel_layer, stage in zip(self.channel_layers, self.stages): + x = channel_layer(x) + x = stage(x) + + return self.norm(x) + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return (kernel_size * dilation - dilation) // 2 + + +class ResBlock1(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super().__init__() + + self.convs1 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + 
padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]), + ) + ), + ] + ) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + ] + ) + self.convs2.apply(init_weights) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.silu(x) + xt = c1(xt) + xt = F.silu(xt) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for conv in self.convs1: + remove_weight_norm(conv) + for conv in self.convs2: + remove_weight_norm(conv) + + +class HiFiGANGenerator(nn.Module): + def __init__( + self, + *, + hop_length: int = 512, + upsample_rates: Tuple[int] = (8, 8, 2, 2, 2), + upsample_kernel_sizes: Tuple[int] = (16, 16, 8, 2, 2), + resblock_kernel_sizes: Tuple[int] = (3, 7, 11), + resblock_dilation_sizes: Tuple[Tuple[int]] = ( + (1, 3, 5), (1, 3, 5), (1, 3, 5)), + num_mels: int = 128, + upsample_initial_channel: int = 512, + use_template: bool = True, + pre_conv_kernel_size: int = 7, + post_conv_kernel_size: int = 7, + post_activation: Callable = partial(nn.SiLU, inplace=True), + ): + super().__init__() + + assert ( + prod(upsample_rates) == hop_length + ), f"hop_length must be {prod(upsample_rates)}" + + self.conv_pre = weight_norm( + nn.Conv1d( + num_mels, + upsample_initial_channel, + pre_conv_kernel_size, + 1, + 
padding=get_padding(pre_conv_kernel_size), + ) + ) + + self.num_upsamples = len(upsample_rates) + self.num_kernels = len(resblock_kernel_sizes) + + self.noise_convs = nn.ModuleList() + self.use_template = use_template + self.ups = nn.ModuleList() + + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + self.ups.append( + weight_norm( + nn.ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + if not use_template: + continue + + if i + 1 < len(upsample_rates): + stride_f0 = np.prod(upsample_rates[i + 1:]) + self.noise_convs.append( + Conv1d( + 1, + c_cur, + kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + ) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes): + self.resblocks.append(ResBlock1(ch, k, d)) + + self.activation_post = post_activation() + self.conv_post = weight_norm( + nn.Conv1d( + ch, + 1, + post_conv_kernel_size, + 1, + padding=get_padding(post_conv_kernel_size), + ) + ) + self.ups.apply(init_weights) + self.conv_post.apply(init_weights) + + def forward(self, x, template=None): + x = self.conv_pre(x) + + for i in range(self.num_upsamples): + x = F.silu(x, inplace=True) + x = self.ups[i](x) + + if self.use_template: + x = x + self.noise_convs[i](template) + + xs = None + + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + + x = xs / self.num_kernels + + x = self.activation_post(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + for up in self.ups: + remove_weight_norm(up) + for block in self.resblocks: + 
block.remove_weight_norm() + remove_weight_norm(self.conv_pre) + remove_weight_norm(self.conv_post) + + +class ADaMoSHiFiGANV1(nn.Module): + def __init__( + self, + checkpoint_path: str = "checkpoints/adamos-generator-1640000.pth", + ): + super().__init__() + + self.backbone = ConvNeXtEncoder( + input_channels=128, + depths=[3, 3, 9, 3], + dims=[128, 256, 384, 512], + drop_path_rate=0, + kernel_sizes=(7,), + ) + + self.head = HiFiGANGenerator( + hop_length=512, + upsample_rates=(4, 4, 2, 2, 2, 2, 2), + upsample_kernel_sizes=(8, 8, 4, 4, 4, 4, 4), + resblock_kernel_sizes=(3, 7, 11, 13), + resblock_dilation_sizes=( + (1, 3, 5), (1, 3, 5), (1, 3, 5), (1, 3, 5)), + num_mels=512, + upsample_initial_channel=1024, + use_template=False, + pre_conv_kernel_size=13, + post_conv_kernel_size=13, + ) + self.sampling_rate = 44100 + + ckpt_state = torch.load(checkpoint_path, map_location="cpu") + + if "state_dict" in ckpt_state: + ckpt_state = ckpt_state["state_dict"] + + if any(k.startswith("generator.") for k in ckpt_state): + ckpt_state = { + k.replace("generator.", ""): v + for k, v in ckpt_state.items() + if k.startswith("generator.") + } + + self.load_state_dict(ckpt_state) + self.eval() + + self.mel_transform = LogMelSpectrogram( + sample_rate=44100, + n_fft=2048, + win_length=2048, + hop_length=512, + f_min=40, + f_max=16000, + n_mels=128, + ) + + @torch.no_grad() + def decode(self, mel): + y = self.backbone(mel) + y = self.head(y) + return y + + @torch.no_grad() + def encode(self, x): + return self.mel_transform(x) + + def forward(self, mel): + y = self.backbone(mel) + y = self.head(y) + return y + + +if __name__ == "__main__": + import soundfile as sf + + x = "./test.wav" + model = ADaMoSHiFiGANV1(checkpoint_path='./step_001640000.pth') + + wav, sr = librosa.load(x, sr=44100, mono=True) + wav = torch.from_numpy(wav).float()[None] + mel = model.encode(wav) + + wav = model.decode(mel)[0].mT + sf.write("test_out.wav", wav.cpu().numpy(), 44100) diff --git 
class CosineWSD(_LRScheduler):
    """Warmup / stable / decay learning-rate schedule.

    Per step (``last_epoch``):
      * below ``warmup_iters``: linear ramp ``base_lr * step / warmup_iters``;
      * below ``step_size``: constant ``base_lr``;
      * up to ``step_size + decay_length`` (inclusive): ``base_lr`` halved
        once every ``decay_interval`` steps past ``step_size``;
      * afterwards: ``eta_min``.
    """

    def __init__(self, optimizer, warmup_iters, step_size, decay_length, decay_interval, eta_min=0, last_epoch=-1):
        # Attributes must exist before the base constructor runs, because it
        # is called last here and get_lr() reads them.
        self.warmup_iters = warmup_iters
        self.step_size = step_size
        self.decay_length = decay_length
        self.decay_interval = decay_interval
        self.eta_min = eta_min
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return one learning rate per entry of ``self.base_lrs`` for the current step."""
        step = self.last_epoch

        if step < self.warmup_iters:
            return [base_lr * step / self.warmup_iters for base_lr in self.base_lrs]

        if step < self.step_size:
            return list(self.base_lrs)

        if step <= self.step_size + self.decay_length:
            exponent = (step - self.step_size) / self.decay_interval
            return [base_lr * (0.5 ** exponent) for base_lr in self.base_lrs]

        return [self.eta_min for _ in self.base_lrs]


def configure_lr_scheduler(optimizer, total_steps_per_epoch, epochs=10, decay_ratio=0.9, decay_interval=1000, warmup_iters=4000):
    """Build a step-wise ``CosineWSD`` scheduler config for *optimizer*.

    The stable phase ends at ``total_steps * decay_ratio``; the decay phase
    spans the remaining steps.

    Returns:
        A one-element list holding a dict with the keys ``"scheduler"``,
        ``"name"`` and ``"interval"``.
    """
    total_steps = total_steps_per_epoch * epochs
    stable_until = total_steps * decay_ratio
    scheduler = CosineWSD(
        optimizer,
        warmup_iters=warmup_iters,
        step_size=stable_until,
        decay_length=total_steps - stable_until,
        decay_interval=decay_interval,
    )
    return [{"scheduler": scheduler, "name": "CosineWSD", "interval": "step"}]
+transformers==4.50.0 +py3langid==0.3.0 +hangul-romanize==0.1.0 +num2words==0.5.14 +spacy==3.8.4 \ No newline at end of file diff --git a/schedulers/scheduling_flow_match_euler_discrete.py b/schedulers/scheduling_flow_match_euler_discrete.py new file mode 100644 index 0000000..997d8ee --- /dev/null +++ b/schedulers/scheduling_flow_match_euler_discrete.py @@ -0,0 +1,394 @@ +# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.utils import BaseOutput, logging +from diffusers.schedulers.scheduling_utils import SchedulerMixin + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + """ + + prev_sample: torch.FloatTensor + + +class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin): + """ + Euler scheduler. 
@register_to_config
def __init__(
    self,
    num_train_timesteps: int = 1000,
    shift: float = 1.0,
    use_dynamic_shifting=False,
    base_shift: Optional[float] = 0.5,
    max_shift: Optional[float] = 1.15,
    base_image_seq_len: Optional[int] = 256,
    max_image_seq_len: Optional[int] = 4096,
):
    """Precompute the training-time sigma / timestep tables.

    Sigmas run from 1 down to ``1 / num_train_timesteps``. Unless
    ``use_dynamic_shifting`` is set, they are warped with the static
    ``shift`` factor here; with dynamic shifting the warp is deferred.

    ``base_shift``, ``max_shift``, ``base_image_seq_len`` and
    ``max_image_seq_len`` are not read in this body (presumably only
    recorded by ``register_to_config`` -- confirm against callers).
    """
    descending = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy()
    sigmas = torch.from_numpy(descending).to(dtype=torch.float32) / num_train_timesteps

    if not use_dynamic_shifting:
        # Static timestep shift; the dynamic variant is applied later, on the fly.
        sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)

    self._step_index = None
    self._begin_index = None

    self.timesteps = sigmas * num_train_timesteps

    # Kept on CPU to avoid too much CPU/GPU communication.
    self.sigmas = sigmas.to("cpu")
    self.sigma_max = self.sigmas[0].item()
    self.sigma_min = self.sigmas[-1].item()
+ """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_noise( + self, + sample: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + noise: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + """ + Forward process in flow-matching + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype) + + if sample.device.type == "mps" and torch.is_floating_point(timestep): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32) + timestep = timestep.to(sample.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(sample.device) + timestep = timestep.to(sample.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep] + elif self.step_index is not None: + # add_noise is called after first denoising step (for inpainting) + step_indices = [self.step_index] * timestep.shape[0] + else: + # add noise is called before first denoising step to create initial latent(img2img) + step_indices = [self.begin_index] * timestep.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(sample.shape): + sigma = 
def set_timesteps(
    self,
    num_inference_steps: int = None,
    device: Union[str, torch.device] = None,
    sigmas: Optional[List[float]] = None,
    mu: Optional[float] = None,
):
    """
    Sets the discrete timesteps used for the diffusion chain (to be run before inference).

    Args:
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. Required when
            `sigmas` is not given; otherwise it is taken from `len(sigmas)`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        sigmas (`List[float]` or `np.ndarray`, *optional*):
            Custom sigma schedule. When omitted, a linear schedule between `sigma_max` and `sigma_min` is
            built. The shift (static or dynamic) is applied to the schedule in both cases.
        mu (`float`, *optional*):
            Shift parameter, required when `use_dynamic_shifting` is enabled in the config.

    Raises:
        ValueError: If dynamic shifting is enabled without `mu`, or if neither `num_inference_steps` nor
            `sigmas` is provided.
    """

    if self.config.use_dynamic_shifting and mu is None:
        raise ValueError(" you have a pass a value for `mu` when `use_dynamic_shifting` is set to be `True`")

    if sigmas is None:
        if num_inference_steps is None:
            raise ValueError("Either `num_inference_steps` or `sigmas` must be provided.")
        timesteps = np.linspace(
            self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps
        )
        sigmas = timesteps / self.config.num_train_timesteps
    else:
        # A plain list cannot be multiplied by the float shift below nor fed
        # to torch.from_numpy, so normalise to an ndarray first. Arrays that
        # are already ndarrays are left untouched.
        if not isinstance(sigmas, np.ndarray):
            sigmas = np.asarray(sigmas, dtype=np.float64)
        num_inference_steps = len(sigmas)

    self.num_inference_steps = num_inference_steps

    if self.config.use_dynamic_shifting:
        sigmas = self.time_shift(mu, 1.0, sigmas)
    else:
        sigmas = self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas)

    sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device)
    timesteps = sigmas * self.config.num_train_timesteps

    self.timesteps = timesteps.to(device=device)
    # A trailing zero lets `step` read `sigmas[step_index + 1]` on the last step.
    self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])

    self._step_index = None
    self._begin_index = None
def step(
    self,
    model_output: torch.FloatTensor,
    timestep: Union[float, torch.FloatTensor],
    sample: torch.FloatTensor,
    s_churn: float = 0.0,
    s_tmin: float = 0.0,
    s_tmax: float = float("inf"),
    s_noise: float = 1.0,
    generator: Optional[torch.Generator] = None,
    return_dict: bool = True,
    omega: Union[float, np.ndarray, torch.Tensor] = 0.0,
) -> Union["FlowMatchEulerDiscreteSchedulerOutput", Tuple]:
    """
    Take one Euler step of the flow-matching ODE, with a mean-preserving rescale of the update.

    The raw update is `dx = (sigma_next - sigma) * model_output`. Its deviation from its global mean is
    multiplied by a factor derived from `omega` (a logistic curve bounded by 0.9 and 1.1, equal to 1.0 at
    `omega == 0`), so the mean of the update is left unchanged.

    Args:
        model_output (`torch.FloatTensor`):
            The direct output from learned diffusion model.
        timestep (`float`):
            The current discrete timestep in the diffusion chain. Integer indices are rejected.
        sample (`torch.FloatTensor`):
            A current instance of a sample created by the diffusion process.
        s_churn (`float`), s_tmin (`float`), s_tmax (`float`), s_noise (`float`):
            Not read by this method.
        generator (`torch.Generator`, *optional*):
            Not read by this method.
        return_dict (`bool`):
            Whether to return a `FlowMatchEulerDiscreteSchedulerOutput` or a tuple.
        omega (`float`, `np.ndarray` or `torch.Tensor`, defaults to 0.0):
            Raw strength of the deviation rescale. The raw and rescaled values are stored on
            `self.omega_bef_rescale` / `self.omega_aft_rescale`.

    Returns:
        `FlowMatchEulerDiscreteSchedulerOutput` or `tuple`:
            If `return_dict` is `True`, the output object is returned, otherwise a tuple whose first element
            is the sample tensor.

    Raises:
        ValueError: If `timestep` is an `int` or an integer tensor.
    """

    def logistic_function(x, L=0.9, U=1.1, x_0=0.0, k=1):
        # L = Lower bound
        # U = Upper bound
        # x_0 = Midpoint (x corresponding to y = 1.0)
        # k = Steepness, can adjust based on preference

        # Tensor inputs go back to the device they came from; NumPy array
        # inputs have no device, so they are placed next to the model output.
        target_device = model_output.device
        if isinstance(x, torch.Tensor):
            target_device = x.device
            x = x.to(torch.float).cpu().numpy()

        new_x = L + (U - L) / (1 + np.exp(-k * (x - x_0)))

        if isinstance(new_x, np.ndarray):
            new_x = torch.from_numpy(new_x).to(device=target_device, dtype=torch.float32)
        return new_x

    self.omega_bef_rescale = omega
    omega = logistic_function(omega, k=0.1)
    self.omega_aft_rescale = omega

    if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
        raise ValueError(
            (
                "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
                " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
                " one of the `scheduler.timesteps` as a timestep."
            ),
        )

    if self.step_index is None:
        self._init_step_index(timestep)

    # Upcast to avoid precision issues when computing prev_sample
    sample = sample.to(torch.float32)

    sigma = self.sigmas[self.step_index]
    sigma_next = self.sigmas[self.step_index + 1]

    # Mean-preserving rescale: only the deviation of the update from its
    # global mean is scaled by omega; omega == 1 gives the plain Euler step.
    dx = (sigma_next - sigma) * model_output
    m = dx.mean()
    dx_ = (dx - m) * omega + m
    prev_sample = sample + dx_

    # Cast sample back to model compatible dtype
    prev_sample = prev_sample.to(model_output.dtype)

    # upon completion increase step index by one
    self._step_index += 1

    if not return_dict:
        return (prev_sample,)

    return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample)
+ +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.utils import BaseOutput, logging +from diffusers.utils.torch_utils import randn_tensor +from diffusers.schedulers.scheduling_utils import SchedulerMixin + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class FlowMatchHeunDiscreteSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + """ + + prev_sample: torch.FloatTensor + + +class FlowMatchHeunDiscreteScheduler(SchedulerMixin, ConfigMixin): + """ + Heun scheduler. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + shift (`float`, defaults to 1.0): + The shift value for the timestep schedule. 
+ """ + + _compatibles = [] + order = 2 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + shift: float = 1.0, + ): + timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32)[::-1].copy() + timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32) + + sigmas = timesteps / num_train_timesteps + sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) + + self.timesteps = sigmas * num_train_timesteps + + self._step_index = None + self._begin_index = None + + self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication + self.sigma_min = self.sigmas[-1].item() + self.sigma_max = self.sigmas[0].item() + + @property + def step_index(self): + """ + The index counter for current timestep. It will increase 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_noise( + self, + sample: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + noise: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + """ + Forward process in flow-matching + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. 
def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
    """
    Build the inference schedule for the two-stage (Heun) sampler.

    A linear schedule between `sigma_max` and `sigma_min` is warped by the configured `shift`. Every entry
    except the first is then duplicated, giving `2 * num_inference_steps - 1` timesteps, and a terminal zero
    is appended to the sigmas.

    Args:
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
    """
    self.num_inference_steps = num_inference_steps

    train_steps = self.config.num_train_timesteps
    shift = self.config.shift

    t_start = self._sigma_to_t(self.sigma_max)
    t_end = self._sigma_to_t(self.sigma_min)
    linear_sigmas = np.linspace(t_start, t_end, num_inference_steps) / train_steps

    shifted = shift * linear_sigmas / (1 + (shift - 1) * linear_sigmas)
    base_sigmas = torch.from_numpy(shifted).to(dtype=torch.float32, device=device)

    # Timesteps: keep the first entry once, repeat every later entry twice.
    base_timesteps = base_sigmas * train_steps
    doubled_timesteps = torch.cat([base_timesteps[:1], base_timesteps[1:].repeat_interleave(2)])
    self.timesteps = doubled_timesteps.to(device=device)

    # Sigmas: same duplication for the interior entries, plus a final zero.
    padded = torch.cat([base_sigmas, torch.zeros(1, device=base_sigmas.device)])
    self.sigmas = torch.cat([padded[:1], padded[1:-1].repeat_interleave(2), padded[-1:]])

    # Clearing dt / derivative puts the scheduler in its first-order state.
    self.prev_derivative = None
    self.dt = None

    self._step_index = None
    self._begin_index = None
def step(
    self,
    model_output: torch.FloatTensor,
    timestep: Union[float, torch.FloatTensor],
    sample: torch.FloatTensor,
    s_churn: float = 0.0,
    s_tmin: float = 0.0,
    s_tmax: float = float("inf"),
    s_noise: float = 1.0,
    generator: Optional[torch.Generator] = None,
    return_dict: bool = True,
    omega: Union[float, np.ndarray, torch.Tensor] = 0.0,
) -> Union["FlowMatchHeunDiscreteSchedulerOutput", Tuple]:
    """
    Take one stage of a Heun (2nd order) step, with a mean-preserving rescale of the update.

    The scheduler alternates between two states. In the first-order state it computes a derivative, stores
    it together with `dt` and the input sample, and takes an Euler step. In the second-order state it
    averages the stored derivative with a new one and re-applies the stored `dt` to the stored sample, then
    clears the stored values. In both states the update `dx = derivative * dt` has its deviation from its
    global mean multiplied by a factor derived from `omega` (a logistic curve bounded by 0.9 and 1.1, equal
    to 1.0 at `omega == 0`).

    Args:
        model_output (`torch.FloatTensor`):
            The direct output from learned diffusion model.
        timestep (`float`):
            The current discrete timestep in the diffusion chain. Integer indices are rejected.
        sample (`torch.FloatTensor`):
            A current instance of a sample created by the diffusion process.
        s_churn (`float`):
            Amount of extra noise ("churn"); noise is only injected when the resulting `gamma` is positive.
        s_tmin (`float`), s_tmax (`float`):
            Sigma range inside which churn is applied.
        s_noise (`float`, defaults to 1.0):
            Scaling factor for noise added to the sample.
        generator (`torch.Generator`, *optional*):
            A random number generator, used for the churn noise.
        return_dict (`bool`):
            Whether to return a `FlowMatchHeunDiscreteSchedulerOutput` or a tuple.
        omega (`float`, `np.ndarray` or `torch.Tensor`, defaults to 0.0):
            Raw strength of the deviation rescale. The raw and rescaled values are stored on
            `self.omega_bef_rescale` / `self.omega_aft_rescale`.

    Returns:
        `FlowMatchHeunDiscreteSchedulerOutput` or `tuple`:
            If `return_dict` is `True`, the output object is returned, otherwise a tuple whose first element
            is the sample tensor.

    Raises:
        ValueError: If `timestep` is an `int` or an integer tensor.
    """

    def logistic_function(x, L=0.9, U=1.1, x_0=0.0, k=1):
        # L = Lower bound
        # U = Upper bound
        # x_0 = Midpoint (x corresponding to y = 1.0)
        # k = Steepness, can adjust based on preference

        # Tensor inputs go back to the device they came from; NumPy array
        # inputs have no device, so they are placed next to the model output.
        target_device = model_output.device
        if isinstance(x, torch.Tensor):
            target_device = x.device
            x = x.to(torch.float).cpu().numpy()

        new_x = L + (U - L) / (1 + np.exp(-k * (x - x_0)))

        if isinstance(new_x, np.ndarray):
            new_x = torch.from_numpy(new_x).to(device=target_device, dtype=torch.float32)
        return new_x

    self.omega_bef_rescale = omega
    omega = logistic_function(omega, k=0.1)
    self.omega_aft_rescale = omega

    if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
        raise ValueError(
            (
                "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
                " `HeunDiscreteScheduler.step()` is not supported. Make sure to pass"
                " one of the `scheduler.timesteps` as a timestep."
            ),
        )

    if self.step_index is None:
        self._init_step_index(timestep)

    # Upcast to avoid precision issues when computing prev_sample
    sample = sample.to(torch.float32)

    if self.state_in_first_order:
        sigma = self.sigmas[self.step_index]
        sigma_next = self.sigmas[self.step_index + 1]
    else:
        # 2nd order / Heun's method
        sigma = self.sigmas[self.step_index - 1]
        sigma_next = self.sigmas[self.step_index]

    gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0

    sigma_hat = sigma * (gamma + 1)

    if gamma > 0:
        noise = randn_tensor(
            model_output.shape, dtype=model_output.dtype, device=model_output.device, generator=generator
        )
        eps = noise * s_noise
        sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5

    if self.state_in_first_order:
        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
        denoised = sample - model_output * sigma
        # 2. convert to an ODE derivative for 1st order
        derivative = (sample - denoised) / sigma_hat
        # 3. Delta timestep
        dt = sigma_next - sigma_hat

        # store for 2nd order step
        self.prev_derivative = derivative
        self.dt = dt
        self.sample = sample
    else:
        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
        denoised = sample - model_output * sigma_next
        # 2. 2nd order / Heun's method
        derivative = (sample - denoised) / sigma_next
        derivative = 0.5 * (self.prev_derivative + derivative)

        # 3. take prev timestep & sample
        dt = self.dt
        sample = self.sample

        # free dt and derivative
        # Note, this puts the scheduler in "first order mode"
        self.prev_derivative = None
        self.dt = None
        self.sample = None

    # Mean-preserving rescale: only the deviation of the update from its
    # global mean is scaled by omega; omega == 1 gives `sample + derivative * dt`.
    dx = derivative * dt
    m = dx.mean()
    dx_ = (dx - m) * omega + m
    prev_sample = sample + dx_

    # Cast sample back to model compatible dtype
    prev_sample = prev_sample.to(model_output.dtype)

    # upon completion increase step index by one
    self._step_index += 1

    if not return_dict:
        return (prev_sample,)

    return FlowMatchHeunDiscreteSchedulerOutput(prev_sample=prev_sample)
def optimize_genre(prompt):
    """Ask the chat model to rewrite *prompt* as a more genre-specific music description.

    The instruction template ``optimize_genre_prompt`` and the caller's text
    are concatenated into a single system message.

    Returns:
        The ``content`` of the first completion choice.
    """
    system_message = {"role": "system", "content": optimize_genre_prompt+prompt}
    response = OpenAI().chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message],
        max_tokens=30,
        temperature=0.7,
    )
    first_choice = response.choices[0]
    return first_choice.message.content
def get_checkpoint_paths(checkpoint_path):
    """List the ``*.ckpt`` files that sit in the same directory as *checkpoint_path*.

    The list is printed before being returned.

    Returns:
        The matching paths as strings, in the order ``Path.glob`` yields them.
    """
    # Collect every checkpoint file path found in the containing directory.
    folder = Path(checkpoint_path).parent
    found = list(map(str, folder.glob("*.ckpt")))
    print(found)
    return found
gr.update(choices=get_checkpoint_paths(checkpoint_path)), + inputs=None, + outputs=[selected_checkpoint] + ) + return selected_checkpoint + + +def create_keyscale_bpm_time_signature_input_ui(options=["auto", "manual"]): + gr.Markdown("### Time and Keyscale Control") + with gr.Group(): + results = [ + ["keyscale", 0], + ["bpm", 0], + ["timesignature", 0], + ["is_music_start", 0], + ["is_music_end", 0], + ] + keyscale_bpm_time_signature_input = gr.List(visible=False, elem_id="keyscale_bpm_time_signature_input", value=results) + audio_duration = gr.Slider(10, 600, step=1, value=MAX_GENERATE_LEN, label="Audio Duration", interactive=True) + with gr.Row(): + is_music_start_input = gr.Radio(["auto", "start", "not_start"], value="auto", label="Is Music Start", elem_id="is_music_start_input") + is_music_end_input = gr.Radio(["auto", "end", "not_end"], value="auto", label="Is Music End", elem_id="is_music_end_input") + + def when_is_music_start_input_change( + is_music_start_input, + ): + nonlocal results + if is_music_start_input == "auto": + is_music_start = 0 + elif is_music_start_input == "start": + is_music_start = 1 + else: + is_music_start = 2 + results[3][1] = is_music_start + return gr.update(elem_id="keyscale_bpm_time_signature_input", value=results) + + is_music_start_input.change( + when_is_music_start_input_change, + inputs=[is_music_start_input], + outputs=[keyscale_bpm_time_signature_input] + ) + + def when_is_music_end_input_change( + is_music_end_input, + ): + nonlocal results + if is_music_end_input == "auto": + is_music_end = 0 + elif is_music_end_input == "end": + is_music_end = 1 + else: + is_music_end = 2 + results[4][1] = is_music_end + return gr.update(elem_id="keyscale_bpm_time_signature_input", value=results) + + is_music_end_input.change( + when_is_music_end_input_change, + inputs=[is_music_end_input], + outputs=[keyscale_bpm_time_signature_input] + ) + + with gr.Row(): + keyscale_control = gr.Radio(options, value="auto", label="Keyscale", 
elem_id="keyscale_control") + bpm_control = gr.Radio(options, value="auto", label="BPM", elem_id="bpm_control") + time_signature_control = gr.Radio(options, value="auto", label="Time Signature", elem_id="time_signature_control") + + keyscale_input = gr.Dropdown(list(keyscale_idx_mapping.keys()), label="Keyscale", info="the keyscale of the music", visible=False, elem_id="keyscale_input") + + def when_keyscale_change( + keyscale_input, + keyscale_control, + ): + nonlocal results + keyscale = keyscale_input + if keyscale_control == "auto": + keyscale = 0 + results[0][1] = keyscale + return [gr.update(elem_id="keyscale_bpm_time_signature_input", value=results), gr.update(elem_id="keyscale_input", visible=(keyscale_control == "manual"))] + + keyscale_input.change( + when_keyscale_change, + inputs=[keyscale_input, keyscale_control], + outputs=[keyscale_bpm_time_signature_input, keyscale_input] + ) + keyscale_control.change( + fn=when_keyscale_change, + inputs=[keyscale_input, keyscale_control], + outputs=[keyscale_bpm_time_signature_input, keyscale_input] + ) + + bpm_input = gr.Slider(30, 200, step=1, value=120, label="BPM", info="the beats per minute of the music", visible=False, interactive=True, elem_id="bpm_input") + + def when_bmp_change( + bpm_input, + bpm_control, + ): + nonlocal results + bpm = bpm_input + if bpm_control == "auto": + bpm = 0 + results[1][1] = bpm + updates = [gr.update(elem_id="keyscale_bpm_time_signature_input", value=results), gr.update(elem_id="bpm_input", visible=(bpm_control == "manual"))] + return updates + + bpm_control.change( + fn=when_bmp_change, + inputs=[bpm_input, bpm_control], + outputs=[keyscale_bpm_time_signature_input, bpm_input] + ) + + bpm_input.change( + when_bmp_change, + inputs=[bpm_input, bpm_control], + outputs=[keyscale_bpm_time_signature_input, bpm_input] + ) + + time_signature_input = gr.Slider(1, 12, step=1, value=4, label="Time Signature", info="the time signature of the music", visible=False, interactive=True, 
elem_id="time_signature_input") + + def when_time_signature_change( + time_signature_input, + time_signature_control, + ): + nonlocal results + time_signature = time_signature_input + if time_signature_control == "auto": + time_signature = 0 + results[2][1] = time_signature + return [gr.update(elem_id="keyscale_bpm_time_signature_input", value=results), gr.update(elem_id="time_signature_input", visible=(time_signature_control == "manual"))] + + time_signature_input.change( + when_time_signature_change, + inputs=[time_signature_input, time_signature_control], + outputs=[keyscale_bpm_time_signature_input, time_signature_input] + ) + time_signature_control.change( + fn=when_time_signature_change, + inputs=[time_signature_input, time_signature_control], + outputs=[keyscale_bpm_time_signature_input, time_signature_input] + ) + + return [audio_duration, keyscale_bpm_time_signature_input] + + +def detect_language(lyrics: str) -> list: + lyrics = lyrics.strip() + if not lyrics: + return gr.update(value="en") + langs = langseg.getTexts(lyrics) + lang_counter = Counter() + for lang in langs: + lang_counter[lang["lang"]] += len(lang["text"]) + lang = lang_counter.most_common(1)[0][0] + return lang + + +def create_output_ui(): + target_audio = gr.Audio(type="filepath", label="Target Audio") + output_audio1 = gr.Audio(type="filepath", label="Generated Audio 1") + output_audio2 = gr.Audio(type="filepath", label="Generated Audio 2") + input_params_json = gr.JSON(label="Input Parameters") + outputs = [output_audio1, output_audio2] + return outputs, target_audio, input_params_json + + +def dump_func(*args): + print(args) + return [] + + +def create_main_demo_ui( + checkpoint_path="checkpoints/aceflow3_0311/1d_epoch=16-step=140k.ckpt", + text2music_process_func=dump_func, + sample_data_func=dump_func, +): + with gr.Blocks( + title="AceFlow 3.0 DEMO (3.5B)", + ) as demo: + gr.Markdown( + """ +

AceFlow 3.0 DEMO

+ """ + ) + selected_checkpoint = create_list_checkpoint_path_ui(checkpoint_path) + + gr.Markdown("Dataset Filter") + with gr.Group(): + with gr.Row(equal_height=True): + language = gr.Dropdown(["en", "zh"], label="Language", value="en", elem_id="language") + dataset_example_idx = gr.Number( + value=-1, + label="Dataset Example Index", + interactive=True + ) + sample_bnt = gr.Button(value="Sample Data", elem_id="sample_bnt", variant="primary") + + with gr.Row(): + with gr.Column(): + audio_duration = gr.Slider(10, 600, step=1, value=MAX_GENERATE_LEN, label="Audio Duration", interactive=True) + + prompt = gr.Textbox(lines=2, label="Tags", max_lines=4) + lyrics = gr.Textbox(lines=9, label="Lyrics", max_lines=9) + + scheduler_type = gr.Radio(["euler", "heun"], value="euler", label="Scheduler Type", elem_id="scheduler_type") + cfg_type = gr.Radio(["cfg", "apg"], value="apg", label="CFG Type", elem_id="cfg_type") + infer_step = gr.Slider(minimum=1, maximum=1000, step=1, value=60, label="Infer Steps", interactive=True) + guidance_scale = gr.Slider(minimum=0.0, maximum=200.0, step=0.1, value=15.0, label="Guidance Scale", interactive=True) + omega_scale = gr.Slider(minimum=-100.0, maximum=100.0, step=0.1, value=10.0, label="Granularity Scale", interactive=True) + manual_seeds = gr.Textbox(label="manual seeds (default None)", placeholder="1,2,3,4", value=None) + + text2music_bnt = gr.Button(variant="primary") + with gr.Column(): + outputs, target_audio, input_params_json = create_output_ui() + + sample_bnt.click( + sample_data_func, + inputs=[dataset_example_idx, audio_duration], + outputs=[target_audio, prompt, lyrics, input_params_json], + ) + text2music_bnt.click( + fn=text2music_process_func, + inputs=[ + audio_duration, + prompt, + lyrics, + input_params_json, + selected_checkpoint, + scheduler_type, + cfg_type, + infer_step, + guidance_scale, + omega_scale, + manual_seeds, + ], outputs=outputs + [input_params_json] + ) + + return demo + + +if __name__ == "__main__": + 
demo = create_main_demo_ui() + demo.launch( + server_name="0.0.0.0", + server_port=7860, + ) diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..35127f2 --- /dev/null +++ b/utils.py @@ -0,0 +1,197 @@ +from loguru import logger +import functools +import numpy as np +import time +import librosa +import sys +import yaml +from threading import Thread + + +logger.remove() +logger.add(sys.stderr, format="{time} {level} {message}", level="INFO") + + +def async_thread(f): + + def wrapper(*args, **kwargs): + t = Thread(target=f, args=args, kwargs=kwargs) + t.start() + + return wrapper + + +def timecost(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + start = time.time() + result = func(*args, **kwargs) + end = time.time() + logger.info(f"{func.__name__} took {end - start} seconds to run") + return result + return wrapper + + +def autocut(wav, min_cut_second=9.9, sample_rate=16_000, frame_length=2048, hop_length=512, cut_threshold=[2e-5, 1, 2**0.5], min_mute_duration=120, min_tail_second=2): + segs = [] + seg_lengths = [] + longest_wav_frames = int(min_cut_second * sample_rate) + if len(wav) < longest_wav_frames: + segs.append(wav) + seg_lengths.append(len(wav)) + return segs, seg_lengths + + # 自适应阈值算法找静音切分点 + candidate_cut_positions = [] + candidate_cut_durations = [] + cut_threshold, cut_threshold_max, cut_step_multiple = cut_threshold + + for i in range(8): + rms = librosa.feature.rms(y=wav, frame_length=frame_length, hop_length=hop_length)[0] + is_mute_mask = rms <= cut_threshold + is_mute = np.zeros_like(rms, dtype='bool') + is_mute[is_mute_mask], is_mute[~is_mute_mask] = True, False + # logger.info(f"{rms.mean()=}, {rms.min()=}, {rms.max()=}, {cut_threshold=}, {is_mute_mask.sum()=}") + last_start = 0 + last_position = 0 + curr_cut_positions = [] + curr_cut_durations = [] + interrupt = False + for i in range(len(is_mute) - 1): + # 从有到无 + if not is_mute[i] and is_mute[i + 1]: + last_start = i + # 从无到有 + if is_mute[i] and not is_mute[i + 
1]: + # 静音部分至少大于等于min_mute_duration + mute_duration = (i - last_start) * \ + hop_length / (sample_rate / 1000) + if mute_duration >= min_mute_duration: + # 切分规则:在静音中间部分作为分割点 + # 还原到wav的帧 + mid = (i + last_start) // 2 + cut_position = mid * hop_length + curr_duration = cut_position - last_position + # 若超了,切分成四份 + if (longest_wav_frames // 2) < curr_duration: + left_cut_position = (last_start+mid) // 2 * hop_length + left_curr_duration = left_cut_position - last_position + curr_cut_positions.append(left_cut_position) + curr_cut_durations.append(left_curr_duration) + last_position = left_cut_position + + right_cut_position = (mid+i) // 2 * hop_length + right_curr_duration = right_cut_position - last_position + curr_cut_positions.append(right_cut_position) + curr_cut_durations.append(right_curr_duration) + last_position = right_cut_position + else: + curr_cut_positions.append(cut_position) + curr_cut_durations.append(curr_duration) + last_position = cut_position + + candidate_cut_positions = curr_cut_positions + candidate_cut_durations = curr_cut_durations + if cut_threshold >= cut_threshold_max: + break + if cut_threshold < cut_threshold_max: + if len(curr_cut_durations) == 0: + curr_cut_positions.append(len(wav)) + curr_cut_durations.append(len(wav)) + else: + curr_cut_positions.append(len(wav)) + curr_cut_durations.append( + curr_cut_positions[-1] - curr_cut_positions[-2]) + max_duration = max(curr_cut_durations) + if max_duration >= longest_wav_frames: + interrupt = True + cut_threshold = cut_threshold * cut_step_multiple + min_mute_duration = int(max(min_mute_duration/cut_step_multiple, 10)) + frame_length = int(max(frame_length / cut_step_multiple, 256)) + hop_length = int(max(hop_length / cut_step_multiple, 64)) + # logger.info(f"Adaptively adjust the threshold: {cut_threshold=} {min_mute_duration=} {frame_length=} {hop_length=} {len(curr_cut_durations)=}") + if not interrupt and len(curr_cut_durations) > 0: + candidate_cut_positions = curr_cut_positions + 
candidate_cut_durations = curr_cut_durations + break + + # logger.info(f"candidate_cut_positions {candidate_cut_positions}") + # logger.info(f"candidate_cut_durations {candidate_cut_durations}") + # 从已有切分点中找最接近最大长度的切分点 + curr_duration = 0 + last_start = 0 + for i, duration in enumerate(candidate_cut_durations): + curr_duration += duration + # 若超出最大限制,以上一个点作为实际切分 + if curr_duration > longest_wav_frames: + segs.append(wav[last_start:candidate_cut_positions[i - 1]]) + seg_lengths.append(curr_duration - duration) + curr_duration = duration + last_start = candidate_cut_positions[i - 1] + if len(candidate_cut_durations) == 0 or (len(candidate_cut_durations)==1 and candidate_cut_durations[0] >= len(wav)): + logger.info("自动切分算法失败,按最长强制切分") + # 按最长强制切分 + last_start = 0 + segs = [] + seg_lengths = [] + for end in range(longest_wav_frames, max(longest_wav_frames, len(wav)), longest_wav_frames): + segs.append(wav[last_start:end]) + seg_lengths.append(end-last_start) + last_start = end + # 解决尾部问题 + if sum(seg_lengths) < len(wav): + for end in range(last_start+longest_wav_frames, max(longest_wav_frames, len(wav)), longest_wav_frames): + segs.append(wav[last_start:end]) + seg_lengths.append(end - last_start) + last_start = end + if sum(seg_lengths) < len(wav): + last_start = sum(seg_lengths) + tail_frame = len(wav) - last_start + if len(segs) > 0 and tail_frame < min_tail_second*sample_rate: + segs.pop() + seg_lengths.pop() + last_start = sum(seg_lengths) + segs.append(wav[last_start:]) + seg_lengths.append(len(wav) - last_start) + + if any([len(seg) > longest_wav_frames for seg in segs]): + new_segs = [] + new_seg_lengths = [] + for seg, seg_length in zip(segs, seg_lengths): + num_cut = len(seg) // longest_wav_frames + num_cut += 1 if len(seg) % longest_wav_frames > 0 else 0 + for i in range(num_cut): + new_segs.append(seg[i*longest_wav_frames:(i+1)*longest_wav_frames]) + new_seg_lengths.append(len(new_segs[-1])) + segs, seg_lengths = new_segs, new_seg_lengths + return segs, 
seg_lengths + + +class ConfigObj: + + def __init__(self, d): + self.__dict__.update(d) + + def __repr__(self) -> str: + return repr(self.__dict__) + + def __str__(self) -> str: + return str(self.__dict__) + + def __getitem__(self, k): + return self.__dict__[k] + + def get(self, k, default=None): + if k in self.__dict__: + return self[k] + else: + return default + + def __setitem__(self, k, v): + self.__dict__[k] = v + + +def load_config(config_path): + with open(config_path, encoding='utf-8') as yaml_file: + config = yaml.safe_load(yaml_file) + return ConfigObj(config)