Commit ad0a687c authored by Marko Kollo

MLP Mini fixes.

parent 0216b675
import json
from utils.mlp_task_adapter import MLPTaskAdapter
from utils.mlp_task_adapter import Helpers
class MLPLitePreprocessor(object):
"""
Cleans texts for classification. Lemmatizes text.
"""
def _reload_env(self):
import dotenv
dotenv.load_dotenv(".env")
def __init__(self, mlp_url=None):
self.mlp_url = mlp_url
self._reload_env()
@staticmethod
def _process_stats(stats):
""" Processes stats from TextCleaner to make them categorical
@@ -39,6 +42,7 @@ class MLPLitePreprocessor(object):
return processed_stats
def transform(self, documents, **kwargs):
"""Takes input documents and creates new fields for further commentary analysis.
:param documents: collection of dictionaries to enhance
@@ -48,32 +52,50 @@ class MLPLitePreprocessor(object):
:rtype: list of dicts
"""
if not kwargs.get('text_cleaner_preprocessor_feature_names', None):
# this is mostly for API requests as they might not have field data - apply to all in this case
input_features = list(documents[0].keys())
else:
input_features = json.loads(kwargs['text_cleaner_preprocessor_feature_names'])
# this is mostly for API requests as they might not have field data - apply to all in this case
feature_names = kwargs.get('mlp-lite_feature_names', None)
output_type = kwargs.get('mlp-lite_output_type', None)
if not kwargs.get('mlp_lite_output_type', None):
output_type = 'full'
else:
output_type = json.loads(kwargs['mlp_lite_output_type'])
input_features = json.loads(feature_names) if feature_names else list(documents[0].keys())
output_type = json.loads(kwargs['mlp-lite_output_type']) if output_type else 'full'
for input_feature in input_features:
try:
input_feature = json.loads(input_feature)["path"]
input_feature = json.loads(input_features)["path"]
except:
pass
texts = [document[input_feature] for document in documents if input_feature in document]
data = {'texts': json.dumps(texts, ensure_ascii=False)}
input_feature_path = input_feature.split(".")
# Nested field or a normal field?
if len(input_feature_path) > 1:
texts = [Helpers.traverse_nested_dict_by_keys(document, input_feature_path) for document in documents]
else:
texts = [document[input_feature] if input_feature in document else "" for document in documents]
data = {'texts': json.dumps(texts, ensure_ascii=False)}
analyzation_data, errors = MLPTaskAdapter(self.mlp_url, mlp_type='mlp_lite').process(data)
for analyzation_idx, analyzation_datum in enumerate(analyzation_data):
documents[analyzation_idx][input_feature + '_mlp-lite'] = {}
documents[analyzation_idx][input_feature + '_mlp-lite']['text'] = analyzation_datum['text']
if output_type == 'full':
documents[analyzation_idx][input_feature + '_mlp-lite']['stats'] = self._process_stats(analyzation_datum['stats'])
# Is it a nested field or a normal field?
if len(input_feature_path) > 1:
# Make sure the last field is used as the path.
mlp_field_path = input_feature_path[:-1] + [input_feature_path[-1] + "_mlp-lite"]
Helpers.set_in_dict(documents[analyzation_idx], mlp_field_path, {})
mlp_text_path = mlp_field_path + ["text"]
Helpers.set_in_dict(documents[analyzation_idx], mlp_text_path, analyzation_datum['text'])
if output_type == 'full':
mlp_stats_path = mlp_field_path + ["stats"]
Helpers.set_in_dict(documents[analyzation_idx], mlp_stats_path, self._process_stats(analyzation_datum["stats"]))
else:
documents[analyzation_idx][input_feature + '_mlp-lite'] = {}
documents[analyzation_idx][input_feature + '_mlp-lite']['text'] = analyzation_datum['text']
if output_type == 'full':
documents[analyzation_idx][input_feature + '_mlp-lite']['stats'] = self._process_stats(analyzation_datum['stats'])
return {'documents': documents, 'meta': {}, 'errors': errors}
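
For orientation, a minimal usage sketch of the reworked transform hook follows. The example documents, the field names "comment" and "meta.body", and the MLP URL are illustrative assumptions; the kwargs keys are the renamed 'mlp-lite_*' parameters read above, with nested fields addressed by dotted paths.

import json

# Illustrative only: uses the MLPLitePreprocessor class above and assumes a reachable MLP Lite service.
docs = [
    {"comment": "First comment to lemmatize.", "meta": {"body": "Nested text of the first document."}},
    {"comment": "Second comment to lemmatize.", "meta": {"body": "Nested text of the second document."}},
]

preprocessor = MLPLitePreprocessor(mlp_url="http://localhost:5000")  # hypothetical URL
result = preprocessor.transform(
    docs,
    **{
        "mlp-lite_feature_names": json.dumps(["comment", "meta.body"]),
        "mlp-lite_output_type": json.dumps("full"),
    }
)

# Each document now carries a '<field>_mlp-lite' object holding the lemmatized 'text';
# with output_type 'full' the categorized 'stats' from _process_stats are attached as well.
processed = result["documents"]
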
@@ -30,19 +30,6 @@ mlp_field_properties = {
}
}
},
'lemmas': {
'type': 'text',
'fields': {
'keyword': {
'type': 'keyword',
'ignore_above': 256
}
}
},
'lang': {
'type': 'keyword',
'ignore_above': 256
}
}
}
@@ -72,7 +59,7 @@ except Exception as e:
log_preprocessor_status(code='mlp_lite', status='disabled')
try:
preprocessor_map['mlp_lite'] = {
preprocessor_map['mlp-lite'] = {
'name': 'MLP Lite',
'description': 'Extracts lemmas and calculates statistics for classifiers.',
'class': MLPLitePreprocessor,
......
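
A quick sketch of how the renamed registry key might be consumed downstream; the lookup and the MLP_URL environment variable are assumptions about the surrounding pipeline, not part of this commit.

import os

# Hypothetical dispatch: resolve the preprocessor by its renamed 'mlp-lite' key
# and instantiate the registered class with the MLP service URL.
entry = preprocessor_map['mlp-lite']
preprocessor = entry['class'](mlp_url=os.getenv('MLP_URL'))  # entry['class'] is MLPLitePreprocessor
print(entry['name'], '-', entry['description'])

The same hyphenated prefix is what the renamed template fields below ('mlp-lite_feature_names', 'mlp-lite_output_type') submit, so the kwargs keys read in transform line up with the form field names.
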
@@ -5,7 +5,7 @@
{% if fields %}
<select class="selectpicker" data-width="100%" multiple="multiple" id="text-cleaner-feature-names"
name="mlp_lite_feature_names" data-style="btn-sm btn-default">
name="mlp-lite_feature_names" data-style="btn-sm btn-default">
{% for field in fields %}
<option value="{{ field.data }}"
{% if forloop.counter < 2 %}selected="selected"{% endif %}>{{ field.label }}</option>
@@ -14,7 +14,7 @@
{% else %}
<textarea class="form-control" rows="2" id="text-cleaner-feature-names"
name="mlp_lite_feature_names"></textarea>
name="mlp-lite_feature_names"></textarea>
{% endif %}
</div>
@@ -23,7 +23,7 @@
<label class="control-label" for="text-cleaner-output-type">Output:</label>
<div class="select-wrapper">
<select class="selectpicker" data-width="100%" id="text-cleaner-output-type"
name="mlp_lite_output_type" data-style="btn-sm btn-default">
name="mlp-lite_output_type" data-style="btn-sm btn-default">
{% for output_type in preprocessor.output_type %}
<option value="{{ output_type }}"
{% if forloop.counter < 2 %}selected="selected"{% endif %}>{{ output_type }}</option>
......
@@ -53,7 +53,7 @@ class Helpers:
chunks = Helpers.chunks(list_of_texts, chunk_size=chunk_size)
for chunk in chunks:
input_data = {"texts": json.dumps(chunk, ensure_ascii=False), "doc_path": data["doc_path"]}
input_data = {"texts": json.dumps(chunk, ensure_ascii=False), "doc_path": data.get("doc_path", None)}
tasks_data.append(input_data)
return tasks_data
......
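
The chunking helper above now tolerates payloads without a 'doc_path' key. Below is a rough restatement of that loop, assuming Helpers.chunks simply slices the text list into fixed-size batches; the function name and the chunk_size default are illustrative, not the library's.

import json

def build_task_payloads(list_of_texts, data, chunk_size=100):
    # Split the texts into fixed-size batches and attach the optional doc_path to each payload.
    tasks_data = []
    for start in range(0, len(list_of_texts), chunk_size):
        chunk = list_of_texts[start:start + chunk_size]
        tasks_data.append({
            "texts": json.dumps(chunk, ensure_ascii=False),
            # .get() keeps mlp-lite payloads, which have no doc_path, from raising a KeyError
            "doc_path": data.get("doc_path", None),
        })
    return tasks_data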