Commit e49082d0 authored by Ubuntu

formatting updated

parent 70703045
from collections import defaultdict
_styleConstraints = defaultdict(lambda: None)
_styleConstraints['ep'] = "▁sa ▁Sa ▁SA ▁sina ▁su ▁Su ▁sinu ▁Sinu ▁sul ▁Sul ▁sulle ▁sinuga ▁arvad ▁tahad ▁oled ▁soovid ▁du ▁dir ▁dich ▁dein ▁deine ▁deinen ▁deiner ▁deines ▁du ▁dir ▁dich ▁dein ▁deine ▁deinen ▁deiner ▁deines".split(" ")
_styleConstraints['ep'] += "▁ты ▁Ты ▁тебя ▁тебе ▁Тебе ▁тобой ▁твой ▁твоё ▁твоему ▁твоим ▁твои".split(" ")
_styleConstraints['ep'] += "▁tu ▁Tu ▁tev ▁tevi".split(" ")
_styleConstraints[
"ep"
] = "▁sa ▁Sa ▁SA ▁sina ▁su ▁Su ▁sinu ▁Sinu ▁sul ▁Sul ▁sulle ▁sinuga ▁arvad ▁tahad ▁oled ▁soovid ▁du ▁dir ▁dich ▁dein ▁deine ▁deinen ▁deiner ▁deines ▁du ▁dir ▁dich ▁dein ▁deine ▁deinen ▁deiner ▁deines".split(
" "
)
_styleConstraints[
"ep"
] += "▁ты ▁Ты ▁тебя ▁тебе ▁Тебе ▁тобой ▁твой ▁твоё ▁твоему ▁твоим ▁твои".split(" ")
_styleConstraints["ep"] += "▁tu ▁Tu ▁tev ▁tevi".split(" ")
_styleConstraints['os'] = "▁te ▁Te ▁teie ▁teid ▁teile ▁Teile ▁teil ▁Teil ▁teilt ▁Teilt ▁Sie ▁Ihne ▁Ihnen ▁Ihner ▁Ihnes ▁Ihn".split(" ")
_styleConstraints['os'] += "▁sir ▁Sir ▁ser ▁Ser".split(" ")
_styleConstraints['os'] += "▁сэр".split(" ")
_styleConstraints['os'] += "▁söör".split(" ")
_styleConstraints['os'] += "▁вы ▁Вы ▁вас ▁Вас ▁вам ▁Вам ▁вами ▁ваш ▁Ваш ▁ваши ▁вашего".split(" ")
_styleConstraints['os'] += "▁jūs ▁Jūs ▁jūsu ▁jums ▁Jums".split(" ")
_styleConstraints[
"os"
] = "▁te ▁Te ▁teie ▁teid ▁teile ▁Teile ▁teil ▁Teil ▁teilt ▁Teilt ▁Sie ▁Ihne ▁Ihnen ▁Ihner ▁Ihnes ▁Ihn".split(
" "
)
_styleConstraints["os"] += "▁sir ▁Sir ▁ser ▁Ser".split(" ")
_styleConstraints["os"] += "▁сэр".split(" ")
_styleConstraints["os"] += "▁söör".split(" ")
_styleConstraints[
"os"
] += "▁вы ▁Вы ▁вас ▁Вас ▁вам ▁Вам ▁вами ▁ваш ▁Ваш ▁ваши ▁вашего".split(" ")
_styleConstraints["os"] += "▁jūs ▁Jūs ▁jūsu ▁jums ▁Jums".split(" ")
def getPolitenessConstraints():
return _styleConstraints
def getPolitenessConstraints():
return _styleConstraints
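For reference, a minimal usage sketch of the constraint table above; the module name styleconstraints is an assumption, and the "avoid" wiring these lists feed into is shown in _preprocess further down in this commit.

# Hypothetical usage; "styleconstraints" is an assumed module name for this file.
from styleconstraints import getPolitenessConstraints

constraints = getPolitenessConstraints()
print(constraints["os"][:5])   # a few of the subword pieces listed for the "os" style
print(constraints["other"])    # the defaultdict falls back to None for unknown keys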
@@ -2,9 +2,10 @@ import sys
from datetime import datetime
def log(msg):
msg = "[DEBUG {0}] {1}\n".format(datetime.now(), msg)
for channel in (sys.stderr,):
#for channel in (sys.stderr, sys.stdout):
channel.write(msg)
msg = "[DEBUG {0}] {1}\n".format(datetime.now(), msg)
for channel in (sys.stderr,):
# for channel in (sys.stderr, sys.stdout):
channel.write(msg)
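A one-line usage sketch: log() timestamps the message and writes it to stderr only, since the stdout channel is commented out above. The module is imported as log elsewhere in this commit.

from log import log

log("loading models")  # writes "[DEBUG <timestamp>] loading models" to stderr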
#!/usr/bin/python3
#-*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import socket
import json
def _politeness(c, expected, response = False):
msg = c.recv(128)
#assert(msg == expected)
print(str(msg) + " received")
if msg == expected:
if response:
c.send(response)
else:
c.send(b"Fail")
def startServer(msgProcFunc, msgProcArgs, host = '172.17.66.215', port = 12349, log = False):
s = socket.socket()
s.bind((host, port))
try:
while True:
try:
s.listen(5)
c, a = s.accept()
_politeness(c, b"HI", b"okay")
oversized = False
msg = c.recv(2048)
if msg.startswith(b"msize:"):
inMsgSize = int(msg.strip().split(b":")[1])
c.send(b'still okay')
msg = c.recv(inMsgSize + 13)
if msg:
responseMsg = msgProcFunc(msg, *msgProcArgs)
else:
responseMsg = bytes(json.dumps({ 'final_trans': '' }), 'utf-8')
if len(responseMsg) > 1024:
print("size warning sent: " + str(len(responseMsg)))
c.send(bytes("msize:" + str(len(responseMsg) + 13), 'ascii'))
_politeness(c, b"OK")
c.send(responseMsg)
c.close()
except Exception as e:
print("ERROR", e)
c.send(bytes(json.dumps({ 'status': 'err', 'exception': str(e) }), 'utf-8'))
finally:
#s.shutdown()
s.close()
print("closed connection")
def _politeness(c, expected, response=False):
msg = c.recv(128)
# assert(msg == expected)
print(str(msg) + " received")
if msg == expected:
if response:
c.send(response)
else:
c.send(b"Fail")
def startServer(msgProcFunc, msgProcArgs, host="172.17.66.215", port=12349, log=False):
s = socket.socket()
s.bind((host, port))
try:
while True:
try:
s.listen(5)
c, a = s.accept()
_politeness(c, b"HI", b"okay")
oversized = False
msg = c.recv(2048)
if msg.startswith(b"msize:"):
inMsgSize = int(msg.strip().split(b":")[1])
c.send(b"still okay")
msg = c.recv(inMsgSize + 13)
if msg:
responseMsg = msgProcFunc(msg, *msgProcArgs)
else:
responseMsg = bytes(json.dumps({"final_trans": ""}), "utf-8")
if len(responseMsg) > 1024:
print("size warning sent: " + str(len(responseMsg)))
c.send(bytes("msize:" + str(len(responseMsg) + 13), "ascii"))
_politeness(c, b"OK")
c.send(responseMsg)
c.close()
except Exception as e:
print("ERROR", e)
c.send(
bytes(json.dumps({"status": "err", "exception": str(e)}), "utf-8")
)
finally:
# s.shutdown()
s.close()
print("closed connection")
@@ -12,103 +12,134 @@ from log import log
from collections import namedtuple
from sockeye.translate import inference
def _preprocess(sentence, index, lang_factor, style_factor,
models, constraints):
truecased_sentence = applytc.processLine(models.truecaser, sentence)
pieces = models.segmenter.EncodeAsPieces(truecased_sentence)
segmented_sentence = ' '.join(pieces)
rawlen = len(pieces)
prejsson = { 'text': segmented_sentence, 'factors': [" ".join([lang_factor] * rawlen), " ".join([style_factor] * rawlen), " ".join(['f0'] * rawlen), " ".join(['g0'] * rawlen)]}
try:
if constraints and constraints[index]:
prejsson['avoid'] = constraints[index]
except IndexError as e:
sys.stderr.write(str(constraints) + ", " + str(index))
raise e
jsson = json.dumps(prejsson)
log("PREPROC received '" + sentence + "', turned it into '" + segmented_sentence + "'")
return jsson
def _preprocess(sentence, index, lang_factor, style_factor, models, constraints):
truecased_sentence = applytc.processLine(models.truecaser, sentence)
pieces = models.segmenter.EncodeAsPieces(truecased_sentence)
segmented_sentence = " ".join(pieces)
rawlen = len(pieces)
prejsson = {
"text": segmented_sentence,
"factors": [
" ".join([lang_factor] * rawlen),
" ".join([style_factor] * rawlen),
" ".join(["f0"] * rawlen),
" ".join(["g0"] * rawlen),
],
}
try:
if constraints and constraints[index]:
prejsson["avoid"] = constraints[index]
except IndexError as e:
sys.stderr.write(str(constraints) + ", " + str(index))
raise e
jsson = json.dumps(prejsson)
log(
"PREPROC received '"
+ sentence
+ "', turned it into '"
+ segmented_sentence
+ "'"
)
return jsson
def _doMany(many, func, args):
return [func(one, idx, *args) for idx, one in enumerate(many)]
return [func(one, idx, *args) for idx, one in enumerate(many)]
def _postprocess(sentence, idx, models):
de_segmented_sentence = models.segmenter.DecodePieces(sentence.split())
try:
de_truecased_sentence = de_segmented_sentence[0].upper() + de_segmented_sentence[1:]
except:
de_truecased_sentence = de_segmented_sentence
log("POSTPROC received '" + sentence + "', turned it into '" + de_truecased_sentence + "'")
return de_truecased_sentence
de_segmented_sentence = models.segmenter.DecodePieces(sentence.split())
try:
de_truecased_sentence = (
de_segmented_sentence[0].upper() + de_segmented_sentence[1:]
)
except:
de_truecased_sentence = de_segmented_sentence
log(
"POSTPROC received '"
+ sentence
+ "', turned it into '"
+ de_truecased_sentence
+ "'"
)
return de_truecased_sentence
def _forward(sentences, models):
trans_inputs = [inference.make_input_from_json_string(sentence_id=i, json_string=sentence, translator=models.translator) for i, sentence in enumerate(sentences)]
outputs = models.translator.translate(trans_inputs)
return [(output.translation, output.score) for output in outputs]
def _loadTranslator(model_folders, ctx = mx.cpu()):
models, source_vocabs, target_vocab = inference.load_models(
context=ctx,
max_input_len=None,
beam_size=3,
batch_size=16,
model_folders=model_folders,
checkpoints=None,
softmax_temperature=None,
max_output_length_num_stds=2,
decoder_return_logit_inputs=False,
cache_output_layer_w_b=False)
return inference.Translator(context=ctx,
ensemble_mode="linear",
bucket_source_width=10,
length_penalty=inference.LengthPenalty(1.0, 0.0),
beam_prune=0,
beam_search_stop='all',
models=models,
source_vocabs=source_vocabs,
target_vocab=target_vocab,
restrict_lexicon=None,
store_beam=False,
strip_unknown_words=False)
trans_inputs = [
inference.make_input_from_json_string(
sentence_id=i, json_string=sentence, translator=models.translator
)
for i, sentence in enumerate(sentences)
]
outputs = models.translator.translate(trans_inputs)
return [(output.translation, output.score) for output in outputs]
def _loadTranslator(model_folders, ctx=mx.cpu()):
models, source_vocabs, target_vocab = inference.load_models(
context=ctx,
max_input_len=None,
beam_size=3,
batch_size=16,
model_folders=model_folders,
checkpoints=None,
softmax_temperature=None,
max_output_length_num_stds=2,
decoder_return_logit_inputs=False,
cache_output_layer_w_b=False,
)
return inference.Translator(
context=ctx,
ensemble_mode="linear",
bucket_source_width=10,
length_penalty=inference.LengthPenalty(1.0, 0.0),
beam_prune=0,
beam_search_stop="all",
models=models,
source_vocabs=source_vocabs,
target_vocab=target_vocab,
restrict_lexicon=None,
store_beam=False,
strip_unknown_words=False,
)
def loadModels(translationModelPath, truecaserModelPath, segmenterModelPath):
"""Load translation, truecasing and segmentation models and
"""Load translation, truecasing and segmentation models and
return them as a named tuple"""
translationModel = _loadTranslator([translationModelPath,])
truecaserModel = applytc.loadModel(truecaserModelPath)
segmenterModel = spm.SentencePieceProcessor()
segmenterModel.Load(segmenterModelPath)
Models = namedtuple("Models", ["translator", "truecaser", "segmenter"])
return Models(translationModel, truecaserModel, segmenterModel)
translationModel = _loadTranslator([translationModelPath,])
truecaserModel = applytc.loadModel(truecaserModelPath)
segmenterModel = spm.SentencePieceProcessor()
segmenterModel.Load(segmenterModelPath)
Models = namedtuple("Models", ["translator", "truecaser", "segmenter"])
return Models(translationModel, truecaserModel, segmenterModel)
def translate(models, sentences, outputLanguage, outputStyle, constraints):
"""Take list of sentences, output language and style as well as a list of constraints,
"""Take list of sentences, output language and style as well as a list of constraints,
and feed them through a set of loaded NMT models.
Return list of translations, list of scores, list of preprocessed input sentences and list of raw translations prior to postprocessing."""
cleaninputs = _doMany(sentences, _preprocess, (outputLanguage, outputStyle, models, constraints))
scoredTranslations = _forward(cleaninputs, models)
translations, scores = zip(*scoredTranslations)
postprocessed_translations = _doMany(translations, _postprocess, (models,))
return postprocessed_translations, scores, cleaninputs, translations
cleaninputs = _doMany(
sentences, _preprocess, (outputLanguage, outputStyle, models, constraints)
)
scoredTranslations = _forward(cleaninputs, models)
translations, scores = zip(*scoredTranslations)
postprocessed_translations = _doMany(translations, _postprocess, (models,))
return postprocessed_translations, scores, cleaninputs, translations
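A hypothetical end-to-end sketch of this module's entry points. The module name, model paths and the language/style factor tokens below are placeholders; only loadModels, translate and getPolitenessConstraints are defined in this commit.

from styleconstraints import getPolitenessConstraints   # assumed module name
from nmt import loadModels, translate                    # placeholder module name for this file

models = loadModels("model/sockeye", "model/truecaser.tc", "model/sentencepiece.model")

sentences = ["Tere, kuidas sul läheb?"]
constraints = [getPolitenessConstraints()["os"]]   # one avoid-list per input sentence

translations, scores, cleaninputs, raw = translate(
    models,
    sentences,
    outputLanguage="en",   # assumed factor token; check the training setup
    outputStyle="os",      # assumed factor token matching the constraint key
    constraints=constraints,
)
print(translations[0], scores[0])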
@@ -5,88 +5,102 @@ import re
from truecaser.learntc import log, tokens
class DefUniqDict(dict):
def __missing__(self, key):
return key
class WordFreqTuple():
def __init__(self, word, freq):
self.word = word
self.freq = freq
def loadModel(filename, freqs = False):
res = DefUniqDict()
with open(filename, 'r') as filehandle:
for l in filehandle:
try:
w, f = l.strip().split('\t')
except:
#sys.stderr.write(l)
#raise e
w = l.strip()
f = 5
res[w.lower()] = WordFreqTuple(w, int(f))
return res
def __missing__(self, key):
return key
class WordFreqTuple:
def __init__(self, word, freq):
self.word = word
self.freq = freq
def loadModel(filename, freqs=False):
res = DefUniqDict()
with open(filename, "r") as filehandle:
for l in filehandle:
try:
w, f = l.strip().split("\t")
except:
# sys.stderr.write(l)
# raise e
w = l.strip()
f = 5
res[w.lower()] = WordFreqTuple(w, int(f))
return res
def isUpper(w):
return re.search(r'[A-Z]', w) and not re.search(r'[a-z]', w)
return re.search(r"[A-Z]", w) and not re.search(r"[a-z]", w)
def truecase(model, wordlist):
return [model[w.lower()].word if (w.lower() in model and (i == 0 or isUpper(w) or wordlist[i-1] in ".:;?!")) else w for i, w in enumerate(wordlist)]
return [
model[w.lower()].word
if (w.lower() in model and (i == 0 or isUpper(w) or wordlist[i - 1] in ".:;?!"))
else w
for i, w in enumerate(wordlist)
]
def updateToken(line, span, newtoken):
return line[:span[0]] + newtoken + line[span[1]:]
return line[: span[0]] + newtoken + line[span[1] :]
def processLine(model, line):
try:
toks = tokens(line)
words, spans = zip(*toks)
try:
toks = tokens(line)
words, spans = zip(*toks)
tcwords = truecase(model, words)
tcwords = truecase(model, words)
resline = line
resline = line
for w, s in zip(tcwords, spans):
resline = updateToken(resline, s, w)
for w, s in zip(tcwords, spans):
resline = updateToken(resline, s, w)
return resline
except:
return line
return resline
except:
return line
def processLines(model, fh):
logFreq = 100000
i = 0
for line in fh:
try:
print(processLine(model, line.strip()))
except ValueError as e:
sys.stderr.write("empty\n")
print("")
pass
i += 1
if not i % logFreq:
log("processed {0} lines".format(i))
if i % logFreq:
log("processed {0} lines".format(i))
if __name__ == '__main__':
modelfile = sys.argv[1]
model = loadModel(modelfile)
try:
filename = sys.argv[2]
except IndexError:
filename = '-'
if filename == '-':
processLines(model, sys.stdin)
else:
with open(filename, 'r') as fh:
processLines(model, fh)
logFreq = 100000
i = 0
for line in fh:
try:
print(processLine(model, line.strip()))
except ValueError as e:
sys.stderr.write("empty\n")
print("")
pass
i += 1
if not i % logFreq:
log("processed {0} lines".format(i))
if i % logFreq:
log("processed {0} lines".format(i))
if __name__ == "__main__":
modelfile = sys.argv[1]
model = loadModel(modelfile)
try:
filename = sys.argv[2]
except IndexError:
filename = "-"
if filename == "-":
processLines(model, sys.stdin)
else:
with open(filename, "r") as fh:
processLines(model, fh)
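A brief usage sketch for the truecasing code above; the import path and model path are assumptions, and the model file format (one tab-separated true-cased form and frequency per line) is what loadModel expects. Note that truecase() only looks up sentence-initial, all-caps or post-punctuation tokens.

from truecaser.applytc import loadModel, processLine   # import path is an assumption

model = loadModel("model/truecaser.tc")                 # placeholder path
print(processLine(model, "tallinn on ilus linn"))
# prints "Tallinn on ilus linn" if the model maps "tallinn" -> "Tallinn"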
@@ -7,75 +7,81 @@ from collections import defaultdict
from operator import itemgetter
from datetime import datetime
def log(msg):
sys.stderr.write("{0}: {1}\n".format(str(datetime.now()), msg))
sys.stderr.write("{0}: {1}\n".format(str(datetime.now()), msg))
def tokens(line):
for m in re.finditer(r'\b\S+\b', line.strip()):
yield m.group(0), m.span()
for m in re.finditer(r"\b\S+\b", line.strip()):
yield m.group(0), m.span()
def learnModel(lines):
log("learning")
rawmodel = defaultdict(lambda: defaultdict(int))
logFreq = 100000
i = 0
for line in lines:
#for words = gettoks(line)
for word, _ in tokens(line):
rawmodel[word.lower()][word] += 1
i += 1
if not i % logFreq:
log("read {0} lines".format(i))
if i % logFreq:
log("read {0} lines".format(i))
return compressModel(rawmodel)
log("learning")
rawmodel = defaultdict(lambda: defaultdict(int))
logFreq = 100000
i = 0
for line in lines:
# for words = gettoks(line)
for word, _ in tokens(line):
rawmodel[word.lower()][word] += 1
i += 1
if not i % logFreq:
log("read {0} lines".format(i))
if i % logFreq:
log("read {0} lines".format(i))
return compressModel(rawmodel)
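A hypothetical driver for learnModel above (file names are placeholders): it learns casing statistics from plain-text lines on stdin and dumps them in the tab-separated form that applytc.loadModel reads back; compressModel, just below, produces the dict being written out.

import sys
from truecaser.learntc import learnModel   # import path is an assumption

model = learnModel(sys.stdin)
with open("truecase.model", "w") as out:   # placeholder output path
    for word, freq in sorted(model.items()):
        out.write("{0}\t{1}\n".format(word, freq))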
def compressModel(rawmodel):
log("compressing")
model = dict()
for key in rawmodel:
sortedItems = sorted(rawmodel[key].items(), key=itemgetter(1), reverse=True)
totFreq = sum(rawmodel[key].values())
if totFreq > 1 and re.search(r'[a-z]', key):
if (len(sortedItems) > 1 and sortedItems[0][1] == sortedItems[1][1]):
winner = max(sortedItems[0][0], sortedItems[1][0])
else:
winner = sortedItems[0][0]
#model.append(winner)
model[winner] = rawmodel[key][winner]
return model
log("compressing")
model = dict()
for key in rawmodel:
sortedItems = sorted(rawmodel[key].items(), key=itemgetter(1), reverse=True)
totFreq = sum(rawmodel[key].values())
if totFreq > 1 and re.search(r"[a-z]", key):
if len(sortedItems) > 1 and sortedItems[0][1] == sortedItems[1][1]:
winner = max(sortedItems[0][0], sortedItems[1][0])
else: