import numpy as np

# Separator / punctuation characters: a token ends whenever one of these
# is seen, and every non-space separator is also emitted as its own token.
s = " `1234567890-=~!@#$%^&*()_+[;,{:<];.}:>\\'/|\"?\n–№…«»→"


def split(text):
    """Split *text* into word tokens and single-character punctuation tokens.

    Characters listed in ``s`` terminate the current word; any such
    character other than a space is also appended as a standalone token.
    Returns a list of non-empty strings.
    """
    tokens = []
    word = ""
    # The appended trailing space guarantees the final word is flushed.
    for ch in text + " ":
        if ch in s:
            if word:
                tokens.append(word)
                word = ""
            if ch != " ":
                tokens.append(ch)
        else:
            word += ch
    return tokens


def tokenize_2str(text: str):
    """Tokenize *text*, splitting a trailing "es" suffix off each token.

    A token ending in "es" is replaced by its stem followed by an empty
    marker token.  NOTE(review): the marker is the empty string — the same
    value as the reserved vocabulary entries below; this looks like a
    special token lost in transcription, confirm against the original data.
    """
    out = []
    for token in split(text):
        if token[-2:] == "es":
            out.append(token[:-2])
            out.append("")  # suffix-marker token
        else:
            out.append(token)
    return out


# Vocabulary tables.  Indices 0-2 are reserved; all three are currently the
# empty string (presumably mangled special tokens such as pad/unk — TODO
# confirm), so the dict collapses to {"": 2} and unknown tokens encode as 2.
# Invariant: every entry of ind2text is a key of text2ind and vice versa,
# so membership in either structure is equivalent.
ind2text = ["", "", ""]
text2ind = {"": 0, "": 1, "": 2}


def fit_on_text(text: str):
    """Add every previously unseen token of *text* to the vocabulary."""
    for token in tokenize_2str(text):
        # O(1) dict membership; the original scanned the ind2text list,
        # making fitting quadratic in vocabulary size.  Equivalent by the
        # invariant documented above.
        if token not in text2ind:
            ind2text.append(token)
            text2ind[token] = len(ind2text) - 1


def fit_on_texts(texts):
    """Fit the vocabulary on an iterable of texts."""
    for text in texts:
        fit_on_text(text)


def tokenize(text: str):
    """Encode *text* as a numpy array of vocabulary indices.

    Unknown tokens map to ``text2ind[""]`` (currently index 2).
    """
    # dict.get with the unknown-token fallback replaces the original
    # O(n) "i in ind2text" list scan per token; same result, O(1) lookup.
    unknown = text2ind[""]
    return np.array([text2ind.get(token, unknown) for token in tokenize_2str(text)])