Commit cda8f021 authored by Marko Kollo

Fixed search handling of nested fields.

parent d560b44c
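In short: when a column's dotted field path pointed into an array of nested objects, the old code fetched the parent with dictor and indexed the first element of the resulting list without guarding against empty or missing arrays; the rewrite delegates the traversal to the new extract_element_from_json helper, which appends None placeholders instead of raising. A minimal sketch, with made-up field names:

# Hypothetical nested documents; "comments" and "author" are illustrative only.
doc = {"comments": [{"author": "alice"}, {"author": "bob"}]}
empty = {"comments": []}

# extract_element_from_json(doc, ["comments", "author"])   -> ["alice", "bob"]
# extract_element_from_json(empty, ["comments", "author"]) -> [None]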
import json
import time
import warnings
from collections import OrderedDict, defaultdict

import dictor
from bs4 import BeautifulSoup

from searcher.view_functions.build_search.translit_highlighting import hl_transliterately
from searcher.view_functions.general.searcher_utils import additional_option_cut_text, improve_facts_readability
from texta.settings import FACT_FIELD
from utils.generic_helpers import extract_element_from_json
from utils.highlighter import ColorPicker, Highlighter

warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

def execute_search(es_m, es_params):
    start_time = time.time()
    out = {'column_names': [], 'aaData': [], 'iTotalRecords': 0, 'iTotalDisplayRecords': 0, 'lag': 0}
    # DEFINING THE EXAMPLE SIZE
    es_m.set_query_parameter('from', es_params['examples_start'])
    es_m.set_query_parameter('size', es_params['num_examples'])
@@ -25,23 +27,23 @@ def execute_search(es_m, es_params):
    es_m.set_query_parameter('highlight', hl_config)
    response = es_m.search()

    out['iTotalRecords'] = response['hits']['total']
    out['iTotalDisplayRecords'] = response['hits']['total']  # number of docs
    if int(out['iTotalDisplayRecords']) > 10000:  # Allow fewer pages if over the page limit
        out['iTotalDisplayRecords'] = '10000'
    out['column_names'] = es_m.get_column_names(facts=True)  # get column names from ES mapping

    strip_html = True
    if 'html_stripping' in es_params:
        strip_html = False

    hits = response['hits']['hits']
    # hits = es_m.remove_html_from_hits(hits)
    counter = 0
    for hit in hits:
        hit_id = str(hit['_id'])
        hit['_source']['_es_id'] = hit_id
        row = OrderedDict([(x, '') for x in out['column_names']])  # OrderedDict to remember column names with their content
        inner_hits = hit['inner_hits'] if 'inner_hits' in hit else {}
        name_to_inner_hits = _derive_name_to_inner_hits(inner_hits)
@@ -69,21 +71,11 @@ def execute_search(es_m, es_params):
            if dictor.dictor(content, col, default=""):
                content = dictor.dictor(content, col, default="")
            else:
                content = extract_element_from_json(content, field_path)
                content = [str(value) for value in content if value is not None]
                content = "\n".join(content) if content else None
            content = str(content) if content else ""

            if strip_html:
                soup = BeautifulSoup(content, "lxml")
@@ -98,7 +90,7 @@ def execute_search(es_m, es_params):
            # Prettify and standardize highlights
            content, hl_data = _prettify_standardize_hls(name_to_inner_hits, col, content, old_content)
            # Append the final content of this col to the row
            if (row[col] == ''):
                row[col] = content
            cols_data[col] = {'highlight_data': hl_data, 'content': content, 'old_content': old_content}
@@ -111,8 +103,8 @@ def execute_search(es_m, es_params):
        for col in row:
            row[col] = additional_option_cut_text(row[col], es_params['short_version_n_char'], count=counter)
        out['aaData'].append([hit_id] + list(row.values()))
        out['lag'] = time.time() - start_time
        counter += 1
    return out
@@ -138,10 +130,10 @@ def _prettify_standardize_hls(name_to_inner_hits, col, content, old_content):
        hl_data.append(datum)
    content = Highlighter(average_colors=True, derive_spans=True,
                          additional_style_string='font-weight: bold;').highlight(
                              str(old_content),
                              hl_data,
                              tagged_text=str(content))
    return content, hl_data
@@ -159,7 +151,7 @@ def _derive_hl_config(es_params):
    post_tag = "</span>"
    hl_config = {"fields": {}, "pre_tags": [pre_tag], "post_tags": [post_tag]}
    for field in es_params:
        if 'match_field' in field and es_params['match_operator_' + field.split('_')[-1]] != 'must_not':
            f = es_params[field]
            for sub_f in f.split(','):
                hl_config['fields'][sub_f] = {"number_of_fragments": 0}
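To illustrate the loop above with hypothetical parameters (the exact es_params key names are assumed from the suffix-splitting logic, e.g. a match_field_1 entry paired with match_operator_1):

es_params = {"match_field_1": "title,body.text", "match_operator_1": "must"}
# After the loop:
# hl_config["fields"] == {"title": {"number_of_fragments": 0},
#                         "body.text": {"number_of_fragments": 0}}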
@@ -17,3 +17,61 @@ def find_key_recursivly(key: str, dictionary: dict):
        if isinstance(d, dict):
            for result in find_key_recursivly(key, d):
                yield result

def extract_element_from_json(obj, path):
    """
    Extracts an element from a nested dictionary or
    a list of nested dictionaries along a specified path.
    If the input is a dictionary, a list is returned.
    If the input is a list of dictionaries, a list of lists is returned.

    obj - list or dict - input dictionary or list of dictionaries
    path - list - list of strings that form the path to the desired element
    """
    def extract(obj, path, ind, arr):
        """
        Extracts an element from a nested dictionary
        along a specified path and returns a list.

        obj - dict - input dictionary
        path - list - list of strings that form the JSON path
        ind - int - starting index
        arr - list - output list
        """
        key = path[ind]
        if ind + 1 < len(path):
            if isinstance(obj, dict):
                if key in obj.keys():
                    extract(obj.get(key), path, ind + 1, arr)
                else:
                    arr.append(None)
            elif isinstance(obj, list):
                if not obj:
                    arr.append(None)
                else:
                    for item in obj:
                        extract(item, path, ind, arr)
            else:
                arr.append(None)
        if ind + 1 == len(path):
            if isinstance(obj, list):
                if not obj:
                    arr.append(None)
                else:
                    for item in obj:
                        arr.append(item.get(key, None))
            elif isinstance(obj, dict):
                arr.append(obj.get(key, None))
            else:
                arr.append(None)
        return arr

    if isinstance(obj, dict):
        return extract(obj, path, 0, [])
    elif isinstance(obj, list):
        outer_arr = []
        for item in obj:
            outer_arr.append(extract(item, path, 0, []))
        return outer_arr
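A quick usage sketch of the new helper (the example data is invented):

profile = {"name": "A", "pets": [{"kind": "cat"}, {"kind": "dog"}]}

extract_element_from_json(profile, ["pets", "kind"])  # -> ['cat', 'dog']
extract_element_from_json(profile, ["pets", "age"])   # -> [None, None]; missing leaf keys become None
extract_element_from_json([profile, {"pets": []}], ["pets", "kind"])  # -> [['cat', 'dog'], [None]]

This is why the caller in execute_search filters out None values before joining the extracted values into a display string.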