Commit f3abf69f authored by Gert Paimla

fix word wrap in constraints

parents 3475a022 3d8cdbbc
1.6.70
\ No newline at end of file
1.6.70
......@@ -31,3 +31,4 @@ dependencies:
- elasticsearch_dsl
- graypy
- python-dotenv
- dictor
import logging
from pprint import pprint
from searcher.dashboard.metafile import BaseDashboardFormater
from typing import Dict, List
from searcher.dashboard.metafile import BaseDashboardFormater
from texta.settings import ERROR_LOGGER
......@@ -39,6 +37,7 @@ class MultiSearchFormater(BaseDashboardFormater):
return final_result
def _format_initial_response(self, response):
"""
Because MultiSearch does not contain a single response, but many, it was
......@@ -58,6 +57,7 @@ class MultiSearchFormater(BaseDashboardFormater):
return final_result
def _add_value_count_percentages(self, aggregation_dict: dict, total_document_count: int, field_counts: dict):
"""
Traverses the previously grouped dictionary of ES aggregations, loops through the value_count
......@@ -71,13 +71,14 @@ class MultiSearchFormater(BaseDashboardFormater):
aggregation_dict["value_count"] = {}
for field_name, doc_count in field_counts.items():
percentage = round( doc_count * 100 / total_document_count, 2)
aggregation_dict["value_count"][field_name] = {'doc_count' : doc_count, 'percentage': percentage}
percentage = round(doc_count * 100 / total_document_count, 2)
aggregation_dict["value_count"][field_name] = {'doc_count': doc_count, 'percentage': percentage}
except ZeroDivisionError as e:
logging.getLogger(ERROR_LOGGER).exception(e)
def _format_aggregation_dict(self, agg_dict: dict):
"""
Taking the aggregation results of a single index, format it into the
......@@ -95,6 +96,7 @@ class MultiSearchFormater(BaseDashboardFormater):
if 'nested' not in field_name:
agg_type, field_name, bucket_suffix = field_name.split('#')
else:
agg_type, field_name, bucket_suffix = ('nested', 'texta_facts', '')
......
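The split('#') parsing above works because MultiSearchConductor (next file) registers every aggregation bucket under a name of the form agg_type#field_name#bucket_suffix. A minimal standalone sketch of that round trip, using a made-up bucket name:

    # Illustrative only: how a '#'-separated bucket name is taken apart again.
    bucket_name = "sterms#article_lead#keyword_terms"  # hypothetical bucket name

    if 'nested' not in bucket_name:
        agg_type, field_name, bucket_suffix = bucket_name.split('#')
    else:
        agg_type, field_name, bucket_suffix = ('nested', 'texta_facts', '')

    print(agg_type, field_name, bucket_suffix)  # -> sterms article_lead keyword_terms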
......@@ -47,40 +47,40 @@ class MultiSearchConductor:
clean_field_name = self._remove_dot_notation(field_name)
search_gateway = elasticsearch_dsl.Search(index=index).using(es)
self.field_counts[field_name] = search_gateway.query("exists", field=clean_field_name).count()
self.field_counts[clean_field_name] = search_gateway.query("exists", field=clean_field_name).count()
# Do not play around with the #, they exist to avoid naming conflicts, as awkward as they may be.
# TODO Find a better solution for this.
if field_type == "text":
if query_body is not None:
search_dsl = self._create_search_object(query_body=query_body, index=index, es=es)
search_dsl.aggs.bucket("sigsterms#{0}#text_sigterms".format(field_name), 'significant_text', field=field_name, filter_duplicate_text=True)
search_dsl.aggs.bucket("sigsterms#{0}#text_sigterms".format(clean_field_name), 'significant_text', field=field_name, filter_duplicate_text=True)
self.multi_search = self.multi_search.add(search_dsl)
elif field_type == "keyword":
search_dsl = self._create_search_object(query_body=query_body, index=index, es=es)
search_dsl.aggs.bucket("sterms#{0}#keyword_terms".format(field_name), 'terms', field=field_name)
search_dsl.aggs.bucket("sterms#{0}#keyword_terms".format(clean_field_name), 'terms', field=field_name)
self.multi_search = self.multi_search.add(search_dsl)
elif field_type == "date":
search_dsl = self._create_search_object(query_body=query_body, index=index, es=es)
search_dsl.aggs.bucket("date_histogram#{0}_month#date_month".format(field_name), 'date_histogram', field=field_name, interval='month')
search_dsl.aggs.bucket("date_histogram#{0}_year#date_year".format(field_name), 'date_histogram', field=field_name, interval='year')
search_dsl.aggs.bucket("date_histogram#{0}_month#date_month".format(clean_field_name), 'date_histogram', field=field_name, interval='month')
search_dsl.aggs.bucket("date_histogram#{0}_year#date_year".format(clean_field_name), 'date_histogram', field=field_name, interval='year')
self.multi_search = self.multi_search.add(search_dsl)
elif field_type == "integer":
search_dsl = self._create_search_object(query_body=query_body, index=index, es=es)
search_dsl.aggs.bucket("extended_stats#{0}#int_stats".format(field_name), 'extended_stats', field=field_name)
search_dsl.aggs.bucket("extended_stats#{0}#int_stats".format(clean_field_name), 'extended_stats', field=field_name)
self.multi_search = self.multi_search.add(search_dsl)
elif field_type == "long":
search_dsl = self._create_search_object(query_body=query_body, index=index, es=es)
search_dsl.aggs.bucket('extended_stats#{0}#long_stats'.format(field_name), 'extended_stats', field=field_name)
search_dsl.aggs.bucket('extended_stats#{0}#long_stats'.format(clean_field_name), 'extended_stats', field=field_name)
self.multi_search = self.multi_search.add(search_dsl)
elif field_type == "float":
search_dsl = self._create_search_object(query_body=query_body, index=index, es=es)
search_dsl.aggs.bucket("extended_stats#{0}#float_stats".format(field_name), 'extended_stats', field=field_name)
search_dsl.aggs.bucket("extended_stats#{0}#float_stats".format(clean_field_name), 'extended_stats', field=field_name)
self.multi_search = self.multi_search.add(search_dsl)
def _texta_facts_agg_handler(self, query_body, index, es):
......@@ -106,7 +106,7 @@ class MultiSearchConductor:
:return: Name of the field with the dot notation removed, e.g. article_lead
"""
if '.' in field_name:
field_name = field_name.split('.')[0]
field_name = field_name.replace(".keyword", "")
return field_name
else:
return field_name
......
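The _remove_dot_notation change above now only strips the Elasticsearch ".keyword" suffix instead of cutting the field name at the first dot, so genuinely nested field names are kept intact. A standalone sketch contrasting the old and the new behaviour (illustrative field names only):

    def remove_dot_notation_old(field_name):
        # previous behaviour: drop everything after the first dot
        return field_name.split('.')[0] if '.' in field_name else field_name

    def remove_dot_notation_new(field_name):
        # new behaviour: only remove the ".keyword" suffix
        return field_name.replace(".keyword", "") if '.' in field_name else field_name

    print(remove_dot_notation_old("article_lead.keyword"))  # article_lead
    print(remove_dot_notation_new("article_lead.keyword"))  # article_lead
    print(remove_dot_notation_old("comment.author"))        # comment
    print(remove_dot_notation_new("comment.author"))        # comment.author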
import json
import time
import warnings
from collections import OrderedDict, defaultdict
from utils.highlighter import Highlighter, ColorPicker
from searcher.view_functions.general.searcher_utils import additional_option_cut_text
from searcher.view_functions.build_search.translit_highlighting import hl_transliterately
from searcher.view_functions.general.searcher_utils import improve_facts_readability
import dictor
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
from searcher.view_functions.build_search.translit_highlighting import hl_transliterately
from searcher.view_functions.general.searcher_utils import additional_option_cut_text, improve_facts_readability
from texta.settings import FACT_FIELD
import time
import json
from utils.generic_helpers import extract_element_from_json
from utils.highlighter import ColorPicker, Highlighter
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
def execute_search(es_m, es_params):
start_time = time.time()
out = {'column_names': [],'aaData': [],'iTotalRecords': 0,'iTotalDisplayRecords': 0,'lag': 0}
out = {'column_names': [], 'aaData': [], 'iTotalRecords': 0, 'iTotalDisplayRecords': 0, 'lag': 0}
# DEFINING THE EXAMPLE SIZE
es_m.set_query_parameter('from', es_params['examples_start'])
es_m.set_query_parameter('size', es_params['num_examples'])
......@@ -22,23 +27,23 @@ def execute_search(es_m, es_params):
es_m.set_query_parameter('highlight', hl_config)
response = es_m.search()
out['iTotalRecords'] = response['hits']['total']
out['iTotalDisplayRecords'] = response['hits']['total'] # number of docs
out['iTotalDisplayRecords'] = response['hits']['total'] # number of docs
if int(out['iTotalDisplayRecords']) > 10000: # Allow less pages if over page limit
if int(out['iTotalDisplayRecords']) > 10000: # Allow less pages if over page limit
out['iTotalDisplayRecords'] = '10000'
out['column_names'] = es_m.get_column_names(facts=True) # get columns names from ES mapping
out['column_names'] = es_m.get_column_names(facts=True) # get columns names from ES mapping
strip_html = True
if 'html_stripping' in es_params:
strip_html = False
hits = response['hits']['hits']
#hits = es_m.remove_html_from_hits(hits)
# hits = es_m.remove_html_from_hits(hits)
counter = 0
for hit in hits:
hit_id = str(hit['_id'])
hit['_source']['_es_id'] = hit_id
row = OrderedDict([(x, '') for x in out['column_names']]) # OrderedDict to remember column names with their content
row = OrderedDict([(x, '') for x in out['column_names']]) # OrderedDict to remember column names with their content
inner_hits = hit['inner_hits'] if 'inner_hits' in hit else {}
name_to_inner_hits = _derive_name_to_inner_hits(inner_hits)
......@@ -51,13 +56,26 @@ def execute_search(es_m, es_params):
field_path = col.split('.')
# Get content for the fields and make facts human readable
# Possible outcomes for a field:
# Normal field value - covered by dictor.
# Object field value - covered by dictor
# List of normal values - check for list and element type
# List of objects - check for list and dict element, get the key values.
content = hit['_source']
if col == FACT_FIELD and col in hit['_source']:
content = improve_facts_readability(hit['_source'][col])
else:
for p in field_path:
content = content[p] if p in content else ''
content = str(content)
# When the value of the field is a normal field or object field.
if dictor.dictor(content, col, default=""):
content = dictor.dictor(content, col, default="")
else:
content = extract_element_from_json(content, field_path)
content = [str(value) for value in content if value is not None]
content = "\n".join(content) if content else None
content = str(content) if content else ""
if strip_html:
soup = BeautifulSoup(content, "lxml")
......@@ -72,7 +90,7 @@ def execute_search(es_m, es_params):
# Prettify and standardize highlights
content, hl_data = _prettify_standardize_hls(name_to_inner_hits, col, content, old_content)
# Append the final content of this col to the row
if(row[col] == ''):
if (row[col] == ''):
row[col] = content
cols_data[col] = {'highlight_data': hl_data, 'content': content, 'old_content': old_content}
......@@ -85,8 +103,8 @@ def execute_search(es_m, es_params):
for col in row:
row[col] = additional_option_cut_text(row[col], es_params['short_version_n_char'], count=counter)
out['aaData'].append([hit_id] + list(row.values()))
out['lag'] = time.time()-start_time
counter +=1
out['lag'] = time.time() - start_time
counter += 1
return out
......@@ -112,14 +130,14 @@ def _prettify_standardize_hls(name_to_inner_hits, col, content, old_content):
hl_data.append(datum)
content = Highlighter(average_colors=True, derive_spans=True,
additional_style_string='font-weight: bold;').highlight(
str(old_content),
hl_data,
tagged_text=str(content))
additional_style_string='font-weight: bold;').highlight(
str(old_content),
hl_data,
tagged_text=str(content))
return content, hl_data
def _transliterate(cols_data, row, translit_cols=['text', 'translit', 'lemmas']):
def _transliterate(cols_data, row, translit_cols=['text', 'translit', 'lemmas']):
# To get nested col value before '.'
hl_cols = [x for x in cols_data if len(x.split('.')) > 1 and x.split('.')[-1] in translit_cols]
# Transliterate the highlighting between hl_cols
......@@ -133,7 +151,7 @@ def _derive_hl_config(es_params):
post_tag = "</span>"
hl_config = {"fields": {}, "pre_tags": [pre_tag], "post_tags": [post_tag]}
for field in es_params:
if 'match_field' in field and es_params['match_operator_'+field.split('_')[-1]] != 'must_not':
if 'match_field' in field and es_params['match_operator_' + field.split('_')[-1]] != 'must_not':
f = es_params[field]
for sub_f in f.split(','):
hl_config['fields'][sub_f] = {"number_of_fragments": 0}
......
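The column extraction above first tries the dictor package (the dependency added earlier) for normal and object field values and only falls back to extract_element_from_json for list-valued fields. A hedged standalone sketch of the dictor part, with an invented hit source:

    import dictor  # pip package, added to the dependency list above

    hit_source = {
        "title": "Example article",               # normal field value
        "comment": {"author": {"name": "Jane"}},  # object field value
    }

    print(dictor.dictor(hit_source, "title", default=""))                # Example article
    print(dictor.dictor(hit_source, "comment.author.name", default=""))  # Jane
    print(dictor.dictor(hit_source, "texta_facts", default=""))          # "" -> execute_search
    # would then fall back to extract_element_from_json for list-valued fields.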
......@@ -100,8 +100,7 @@ def get_fields(es_m):
path_list = path.split('.')
label = '{0} --> {1}'.format(path_list[0], path_list[-1]) if len(path_list) > 1 else path_list[0]
label = label.replace('-->', u'→')
label = u'→'.join(path_list)
if data['type'] == 'date':
data['range'] = get_daterange(es_m, path)
......@@ -166,7 +165,7 @@ def dashboard_visualize(request):
indices = es_params.get("chosen_index", None).split(',')
for i in range(len(indices)):
indices[i] = indices[i].replace('.', '-')
indices[i] = indices[i].replace('.', '-').replace("*", "WILDCARD")
color_setting = request.POST['dashboard-color']
color_max = request.POST['dashboard-color-maximum']
......
......@@ -261,6 +261,6 @@ function swalCustomTypeDisplay(swalType, title, text) {
title: title,
text: text,
heightAuto: false,
type: swalType
type: swalType,
})
}
......@@ -47,7 +47,7 @@ function removeLoader() {
function createIndices(indicesArray, data) {
data.indices.forEach((element) => {
indicesArray.push(new Index(element.aggregations, element.index_name.replace('.', '-'), element.total_documents))
indicesArray.push(new Index(element.aggregations, element.index_name.replace('.', '-').replace("*", "WILDCARD"), element.total_documents))
});
return indicesArray
}
......
......@@ -61,28 +61,30 @@ class MlpPreprocessor(object):
# This part is under a try/except because it's a notorious troublemaker.
try:
analyzation_datum = analyzation_datum[0]
except Exception as e:
logging.getLogger(ERROR_LOGGER).exception(analyzation_datum)
input_feature_path = input_feature.split(".")
if len(input_feature) == 1:
documents[analyzation_idx][input_feature + '_mlp'] = analyzation_datum['text']
documents[analyzation_idx][input_feature + '_mlp']['lang'] = analyzation_datum['text']['lang']
if 'texta_facts' not in documents[analyzation_idx]:
documents[analyzation_idx]['texta_facts'] = []
documents[analyzation_idx]['texta_facts'].extend(analyzation_datum['texta_facts'])
input_feature_path = input_feature.split(".")
if len(input_feature) == 1:
documents[analyzation_idx][input_feature + '_mlp'] = analyzation_datum['text']
documents[analyzation_idx][input_feature + '_mlp']['lang'] = analyzation_datum['text']['lang']
if 'texta_facts' not in documents[analyzation_idx]:
documents[analyzation_idx]['texta_facts'] = []
documents[analyzation_idx]['texta_facts'].extend(analyzation_datum['texta_facts'])
else:
# Make sure the last field is used as the path.
mlp_field_path = input_feature_path[:-1] + [input_feature_path[-1] + "_mlp"]
Helpers.set_in_dict(documents[analyzation_idx], mlp_field_path, analyzation_datum['text'])
else:
# Make sure the last field is used as the path.
mlp_field_path = input_feature_path[:-1] + [input_feature_path[-1] + "_mlp"]
Helpers.set_in_dict(documents[analyzation_idx], mlp_field_path, analyzation_datum['text'])
lang_path = mlp_field_path + ["lang"]
Helpers.set_in_dict(documents[analyzation_idx], lang_path, analyzation_datum['text']['lang'])
lang_path = mlp_field_path + ["lang"]
Helpers.set_in_dict(documents[analyzation_idx], lang_path, analyzation_datum['text']['lang'])
if 'texta_facts' not in documents[analyzation_idx]:
documents[analyzation_idx]["texta_facts"] = []
if 'texta_facts' not in documents[analyzation_idx]:
documents[analyzation_idx]["texta_facts"] = []
documents[analyzation_idx]["texta_facts"].extend(analyzation_datum["texta_facts"])
documents[analyzation_idx]["texta_facts"].extend(analyzation_datum["texta_facts"])
except Exception as e:
logging.getLogger(ERROR_LOGGER).exception("Error: {}, Document ID: {}".format(e, documents[analyzation_idx]))
continue
return {'documents': documents, 'meta': {}, 'errors': errors}
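For the nested-field branch above, the MLP result is written back along a key path with Helpers.set_in_dict. A small standalone sketch of that kind of path-based write (assuming, as the calls suggest, that set_in_dict walks the keys and assigns the value at the last one; the helper below is illustrative, not the actual implementation):

    def set_in_dict_sketch(document, path, value):
        # Walk all keys except the last, creating dicts as needed, then assign.
        node = document
        for key in path[:-1]:
            node = node.setdefault(key, {})
        node[path[-1]] = value

    doc = {"comment": {"text": "tere"}}
    input_feature_path = "comment.text".split(".")
    mlp_field_path = input_feature_path[:-1] + [input_feature_path[-1] + "_mlp"]

    set_in_dict_sketch(doc, mlp_field_path, {"lang": "et", "lemmas": "tere"})
    # doc is now {"comment": {"text": "tere", "text_mlp": {"lang": "et", "lemmas": "tere"}}}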
import json
import logging
from texta.settings import ERROR_LOGGER
from utils.mlp_task_adapter import MLPTaskAdapter
from utils.mlp_task_adapter import Helpers
......@@ -78,24 +80,29 @@ class MLPLitePreprocessor(object):
analyzation_data, errors = MLPTaskAdapter(self.mlp_url, mlp_type='mlp_lite').process(data)
for analyzation_idx, analyzation_datum in enumerate(analyzation_data):
# Because for whatever reason, at times this will be None
# If it happens, ignore it, log it, and move on with life.
try:
# Is it a nested field or a normal field?
if len(input_feature_path) > 1:
# Make sure the last field is used as the path.
mlp_field_path = input_feature_path[:-1] + [input_feature_path[-1] + "_mlp-lite"]
Helpers.set_in_dict(documents[analyzation_idx], mlp_field_path, {})
mlp_text_path = mlp_field_path + ["text"]
Helpers.set_in_dict(documents[analyzation_idx], mlp_text_path, analyzation_datum['text'])
if output_type == 'full':
mlp_stats_path = mlp_field_path + ["stats"]
Helpers.set_in_dict(documents[analyzation_idx], mlp_stats_path, self._process_stats(analyzation_datum["stats"]))
# Is it a nested field or a normal field?
if len(input_feature_path) > 1:
# Make sure the last field is used as the path.
mlp_field_path = input_feature_path[:-1] + [input_feature_path[-1] + "_mlp-lite"]
Helpers.set_in_dict(documents[analyzation_idx], mlp_field_path, {})
mlp_text_path = mlp_field_path + ["text"]
Helpers.set_in_dict(documents[analyzation_idx], mlp_text_path, analyzation_datum['text'])
if output_type == 'full':
mlp_stats_path = mlp_field_path + ["stats"]
Helpers.set_in_dict(documents[analyzation_idx], mlp_stats_path, self._process_stats(analyzation_datum["stats"]))
else:
documents[analyzation_idx][input_feature + '_mlp-lite'] = {}
documents[analyzation_idx][input_feature + '_mlp-lite']['text'] = analyzation_datum['text']
if output_type == 'full':
documents[analyzation_idx][input_feature + '_mlp-lite']['stats'] = self._process_stats(analyzation_datum['stats'])
else:
documents[analyzation_idx][input_feature + '_mlp-lite'] = {}
documents[analyzation_idx][input_feature + '_mlp-lite']['text'] = analyzation_datum['text']
if output_type == 'full':
documents[analyzation_idx][input_feature + '_mlp-lite']['stats'] = self._process_stats(analyzation_datum['stats'])
except Exception as e:
logging.getLogger(ERROR_LOGGER).exception("Error Message: {}, Document: {}".format(e, documents[analyzation_idx]))
continue
return {'documents': documents, 'meta': {}, 'errors': errors}
import sys
from datetime import datetime
import json
import os
......@@ -24,7 +25,7 @@ from task_manager.document_preprocessor import PREPROCESSOR_INSTANCES
class PreprocessorWorker(BaseWorker):
def __init__(self, scroll_size=200, time_out='10m'):
def __init__(self, scroll_size=100, time_out='50m'):
self.es_m = None
self.task_id = None
self.params = None
......@@ -34,12 +35,14 @@ class PreprocessorWorker(BaseWorker):
self._reload_env()
self.info_logger, self.error_logger = self._generate_loggers()
def _reload_env(self):
from dotenv import load_dotenv
from pathlib import Path
env_path = str(Path('.env'))
load_dotenv(dotenv_path=env_path)
def _generate_loggers(self):
import graypy
import os
......@@ -52,6 +55,7 @@ class PreprocessorWorker(BaseWorker):
return info_logger, error_logger
def run(self, task_id):
self.task_id = task_id
task = Task.objects.get(pk=self.task_id)
......@@ -86,6 +90,7 @@ class PreprocessorWorker(BaseWorker):
task.result = json.dumps({'error': repr(e)})
task.update_status(Task.STATUS_FAILED, set_time_completed=True)
def _preprocessor_worker(self):
field_paths = []
show_progress = ShowProgress(self.task_id)
......@@ -132,11 +137,21 @@ class PreprocessorWorker(BaseWorker):
show_progress.update(total_hits)
self.es_m.update_documents_by_id(doc_ids)
# Get next page if any
response = self.es_m.scroll(scroll_id=scroll_id, time_out=self.scroll_time_out)
total_hits = len(response['hits']['hits'])
scroll_id = response['_scroll_id']
# For partial update
doc_ids = [x['_id'] for x in response['hits']['hits'] if '_id' in x]
try:
response = self.es_m.scroll(scroll_id=scroll_id, time_out=self.scroll_time_out)
total_hits = len(response['hits']['hits'])
scroll_id = response['_scroll_id']
# For partial update
doc_ids = [x['_id'] for x in response['hits']['hits'] if '_id' in x]
except KeyError as e:
t, v, tb = sys.exc_info()
self.error_logger.exception(t)
self.error_logger.exception(v)
self.error_logger.exception(tb)
self.error_logger.exception(response)
raise e
task = Task.objects.get(pk=self.task_id)
show_progress.update(100)
......@@ -156,6 +171,7 @@ class PreprocessorWorker(BaseWorker):
task.time_completed = datetime.now()
task.save()
def _prepare_preprocessor_data(self, response: dict):
"""
Separates document dicts and id strings from the pure ES response and changes
......@@ -178,6 +194,7 @@ class PreprocessorWorker(BaseWorker):
return documents, parameter_dict, ids, document_locations
@staticmethod
def _parse_query(parameters):
"""
......@@ -196,6 +213,7 @@ class PreprocessorWorker(BaseWorker):
query = json.loads(Search.objects.get(pk=int(search)).query)
return query
@staticmethod
def _check_if_request_bad(args):
'''Check if models/fields are selected'''
......
import time
from functools import wraps
from django.http import HttpResponseRedirect
from texta.settings import SERVER_TYPE
......@@ -10,3 +13,52 @@ def HTTPS_ResponseRedirect(request, url):
absolute_url = request.build_absolute_uri(url)
new_url = "https%s" % absolute_url[4:]
return HttpResponseRedirect(new_url)
def retry(ExceptionToCheck, tries=4, delay=3, backoff=2, logger=None):
"""Retry calling the decorated function using an exponential backoff.
http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry
:param ExceptionToCheck: the exception to check. may be a tuple of
exceptions to check
:type ExceptionToCheck: Exception or tuple
:param tries: number of times to try (not retry) before giving up
:type tries: int
:param delay: initial delay between retries in seconds
:type delay: int
:param backoff: backoff multiplier e.g. value of 2 will double the delay
each retry
:type backoff: int
:param logger: logger to use. If None, print
:type logger: logging.Logger instance
"""
def deco_retry(f):
@wraps(f)
def f_retry(*args, **kwargs):
mtries, mdelay = tries, delay
while mtries > 1:
try:
return f(*args, **kwargs)
except ExceptionToCheck as e:
msg = "%s, Retrying in %d seconds..." % (str(e), mdelay)
if logger:
logger.warning(msg)
else:
print(msg)
time.sleep(mdelay)
mtries -= 1
mdelay *= backoff
return f(*args, **kwargs)
return f_retry # true decorator
return deco_retry
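A hedged usage sketch of the retry decorator above; the function name and exception below are illustrative only:

    @retry(ConnectionError, tries=4, delay=3, backoff=2)
    def fetch_scroll_page():
        # On failure this is retried after 3, 6 and 12 seconds; the fourth
        # attempt is made outside the loop, so its exception reaches the caller.
        ...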
......@@ -17,3 +17,61 @@ def find_key_recursivly(key: str, dictionary: dict):
if isinstance(d, dict):
for result in find_key_recursivly(key, d):
yield result
def extract_element_from_json(obj, path):
"""
Extracts an element from a nested dictionary or
a list of nested dictionaries along a specified path.
If the input is a dictionary, a list is returned.
If the input is a list of dictionaries, a list of lists is returned.
obj - list or dict - input dictionary or list of dictionaries
path - list - list of strings that form the path to the desired element
"""
def extract(obj, path, ind, arr):
"""
Extracts an element from a nested dictionary
along a specified path and returns a list.
obj - dict - input dictionary
path - list - list of strings that form the JSON path
ind - int - starting index
arr - list - output list