Source code for hansken.query

# encoding=utf-8

from collections.abc import Iterable, Sized
from datetime import date, datetime
import json
import re
import warnings

from logbook import Logger

from hansken.util import format_datetime, GeographicLocation, omit_empty, Vector


log = Logger(__name__)


# 'known' default maximum number of clauses in a (boolean) query
DEFAULT_MAX_CLAUSE_COUNT = 1024


def _format_value(value):
    """
    Formats a value suitable for Hansken Query Language:

    - date, datetime: ISO 8601
    - GeographicLocation: ISO 6709 string
    - tuple or list of 2 numbers: assumed to be a latitude/longitude pair,
      formatted as an ISO 6709 string
    - others: no change
    """
    # order is significant here: datetime is a subclass of date, but a plain date is allowed without a timezone
    if isinstance(value, datetime):
        return format_datetime(value)
    if isinstance(value, date):
        return value.isoformat()

    if isinstance(value, GeographicLocation):
        return str(value)
    if isinstance(value, (tuple, list)) and len(value) == 2 and all(isinstance(part, (float, int)) for part in value):
        return str(GeographicLocation(*value))

    return value

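# Illustrative example (not part of the original module): dates become ISO 8601
# strings, 2-tuples of numbers become ISO 6709 coordinate strings (via
# GeographicLocation) and any other value passes through unchanged:
#
#     >>> _format_value(date(2023, 5, 17))
#     '2023-05-17'
#     >>> _format_value('query.py')
#     'query.py'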

def _flatten(q_type, *queries):
    """
    Yields the supplied queries, expanding any query of type *q_type* into its
    contained clauses.

    :param q_type: (iterable) type of query to flatten
    :param queries: queries to process
    :return: generator yielding queries
    """
    for q in queries:
        if isinstance(q, q_type):
            # flatten the top level only
            for clause in q:
                yield clause
        else:
            yield q


def _parse_scale(scale):
    """
    Parses a scale string such as 'log2', 'log10' or 'linear@1024' into a
    scale and a base or interval, as defined by the Facet type of the Hansken
    search request.

    :param scale: a scale string
    :return: a 3-tuple (scale, base, interval)
    """
    base = None
    interval = None
    if scale and scale.startswith('log') and len(scale) > len('log'):
        base = int(scale[len('log'):])
        scale = 'log'
    elif scale and scale.startswith('linear@'):
        interval = int(scale[len('linear@'):])
        scale = 'linear'

    return scale, base, interval

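# Illustrative example (not part of the original module): _parse_scale splits
# a scale string into its components, leaving unrecognized scales untouched:
#
#     >>> _parse_scale('log2')
#     ('log', 2, None)
#     >>> _parse_scale('linear@1024')
#     ('linear', None, 1024)
#     >>> _parse_scale('year')
#     ('year', None, None)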

class Query:
    """
    Base class for Hansken query types. Implementations are required to
    implement `.as_dict` for transformation to wire format.
    """

    def as_dict(self):
        """
        Turns this query into a dict as specified by the Hansken Query
        Language Specification.
        """
        raise NotImplementedError()

    def __and__(self, other):
        """
        Binary and operator (``&``) handling, resulting in an `.And` query.
        Resulting query is flattened when one or more operands are already
        `.And` queries.
        """
        if not isinstance(other, Query):
            raise TypeError('right hand operand not Query')

        return And(*_flatten(And, self, other))

    def __or__(self, other):
        """
        Binary or operator (``|``) handling, resulting in an `.Or` query.
        Resulting query is flattened when one or more operands are already
        `.Or` queries.
        """
        if not isinstance(other, Query):
            raise TypeError('right hand operand not Query')

        return Or(*_flatten(Or, self, other))

    def __invert__(self):
        """
        Binary not operator (``~``) handling, resulting in a `.Not` query.
        """
        return Not(self)

    def __str__(self):
        """
        Encodes this query as a JSON string.
        """
        return json.dumps(self.as_dict())

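# Illustrative example (not part of the original module): the operator
# overloads above compose query trees with ``&``, ``|`` and ``~``; chained
# operands of the same type are flattened into a single clause:
#
#     >>> q = Term('file.name', 'query.py') & Range('data.raw.size', min=512) & ~Term('file.extension', 'pyc')
#     >>> len(q.as_dict()['and'])  # three clauses in one flat 'and', not nested
#     3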
class And(Sized, Iterable, Query):
    """
    Boolean conjunction of multiple queries; traces should match all
    contained queries, for example:

    .. code-block:: python

        And(Term('file.name', 'query.py'),
            Range('data.raw.size', min=512))
    """

    def __init__(self, *queries):
        self.queries = set(queries)

    def as_dict(self):
        return {'and': [query.as_dict() for query in self.queries]}

    def __len__(self):
        return len(self.queries)

    def __iter__(self):
        return iter(self.queries)

class Or(Sized, Iterable, Query):
    """
    Boolean disjunction of multiple queries; traces should match any
    contained query, for example:

    .. code-block:: python

        Or(Term('file.name', 'query.py'),
           Range('data.raw.size', max=1024))
    """

    def __init__(self, *queries):
        self.queries = set(queries)

    def as_dict(self):
        return {'or': [query.as_dict() for query in self.queries]}

    def __len__(self):
        return len(self.queries)

    def __iter__(self):
        return iter(self.queries)

class Not(Query):
    """
    Negates a single query, for example:

    .. code-block:: python

        Not(Term('file.name', 'query.py'))
    """

    def __init__(self, query):
        self.query = query

    def as_dict(self):
        return {'not': self.query.as_dict()}

class Nested(Query):
    """
    Query a field for values matching the results of another query, for
    example:

    .. code-block:: python

        Nested('data.raw.hash.md5', Term('file.name', 'query.py'))
    """

    def __init__(self, field, query):
        self.field = field
        self.query = query

    def as_dict(self):
        return {'nested': {
            'field': self.field,
            'query': self.query.as_dict()
        }}

class Tracelet(Query):
    """
    Restrict a query for a tracelet type to the same tracelet instance of
    that tracelet type.

    .. code-block:: python

        # find traces containing an entity
        Tracelet('entity')

        # find traces containing an entity that has both:
        # - a value starting with "http://"
        # - a confidence of at least 0.9
        Tracelet('entity', Term('entity.value', 'http://*', full=True) &
                           Range('entity.confidence', min=0.9))

    Note that without the `Tracelet` query, the `Term` and `Range` queries
    above could match different entities, ultimately matching traces that
    contain *any* entity with a value starting with ``http://`` and *any*
    entity with a confidence of at least 0.9 (not necessarily the same
    entity).
    """

    def __init__(self, tracelet_type, query=None):
        self.tracelet_type = tracelet_type
        self.query = query

    def as_dict(self):
        return {'hasTracelet': {
            'type': self.tracelet_type,
            # default to an empty dict, translating to an any-query (match any tracelet instance)
            'query': self.query.as_dict() if self.query else {}
        }}

class Trace(Query):
    """
    Restrict a tracelet query to tracelets belonging to traces matching the
    inner query.

    .. code-block:: python

        # match entities of type iban, but only if the trace they belong to is from a specific image
        Term('entity.type', 'iban') & Trace(Term('image', '1234-abcd'))
    """

    def __init__(self, query):
        self.query = query

    def as_dict(self):
        return {'hasTrace': {
            'query': self.query.as_dict()
        }}

class Term(Query):
    """
    Query for the value of a single field, for example:

    .. code-block:: python

        # search for files with name "query.py"
        Term('file.name', 'query.py')

        # search for occurrences of the term "query" (in either data or metadata)
        Term('query')
    """

    def __init__(self, field_or_value, value=None, full=False):
        """
        Create a new `.Term` query.

        :param field_or_value: the field to search, or (when *value* is not
            supplied) the search value
        :param value: value to search for (only needed when searching a
            specific field)
        :param full: search the untokenized variant of any string, see
            :ref:`full matches <full_match>`
        """
        # allow first param to be the value, set field to 'text' to request a general term search
        if value is None:
            value = field_or_value
            # full matches won't work on text (which expands to both meta and data)
            field = 'meta' if full else 'text'
        else:
            field = field_or_value

        if isinstance(value, float):
            raise TypeError('float value for Term not supported: {}'.format(value))

        self.field = field
        self.value = value
        self.full = full

    def as_dict(self):
        return {'term': {
            'field': self.field,
            'value': _format_value(self.value),
            'fullMatch': self.full,
        }}

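# Illustrative example (not part of the original module): the single-argument
# form searches the generic 'text' property, the two-argument form a specific
# field:
#
#     >>> Term('query').as_dict()
#     {'term': {'field': 'text', 'value': 'query', 'fullMatch': False}}
#     >>> Term('file.name', 'query.py').as_dict()
#     {'term': {'field': 'file.name', 'value': 'query.py', 'fullMatch': False}}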
class Regex(Query):
    """
    Query a field for string-values matching a regular expression, for
    example:

    .. code-block:: python

        # search for replies or forwards
        Regex('email.subject', '(re|fw): .*', full=True)

        # search for bombs, or some curious misspellings
        Regex(re.compile(r'bo[mn]+bs'))

    Either a `str` or a `re.Pattern` object is accepted, of which only the
    ``pattern`` property is used.

    .. note::

        - Regular expressions always match entire terms or (in case of
          ``full=True``) properties, as if the regular expression was
          anchored at both ends, see :ref:`full matches <full_match>`.
        - Not every feature supported by Python's `re` module (like
          particular character classes (``\\s`` / ``\\w``), start/end
          anchors (``^`` / ``$``), look ahead/behind or non-greedy
          quantifiers (``??`` / ``*?``)) will be supported by Hansken. The
          use of these is *not* validated by ``hansken.py``, but will result
          in errors when submitted.
        - Regular expression queries are *always* case insensitive and
          ignore diacritics in values.
    """

    def __init__(self, field_or_pattern, pattern=None, full=False):
        """
        Create a new `.Regex` query.

        :param field_or_pattern: the field to match, or (when *pattern* is
            not supplied) the search pattern
        :param pattern: pattern to match, either a `str` or `re.Pattern`
        :param full: match the untokenized variant of the value, see
            :ref:`full matches <full_match>`
        """
        # allow first param to be the pattern, set field to 'text' to request a general term match
        if pattern is None:
            pattern = field_or_pattern
            # full matches won't work on text (which expands to both meta and data)
            field = 'meta' if full else 'text'
        else:
            field = field_or_pattern

        if isinstance(pattern, re.Pattern):
            if pattern.flags & ~re.UNICODE:
                log.warn('regular expression flags in Query objects are ignored ({} used by pattern)',
                         str(re.RegexFlag(pattern.flags)))
            # remote has no use for a Pattern object, use value's original pattern string
            pattern = pattern.pattern

        if not isinstance(pattern, str):
            # the only sanity check we can do on the client side
            raise TypeError(f'query value for Regex must be str, not {type(pattern).__name__}')

        self.field = field
        self.pattern = pattern
        self.full = full

    def as_dict(self):
        return {'regex': {
            'field': self.field,
            'value': self.pattern,
            'fullMatch': self.full,
        }}

class Range(Query):
    """
    Query a field for values in a particular range, for example:

    .. code-block:: python

        # search for traces with entropy between 4.0 (exclusive) and 7.0 (inclusive)
        Range('data.raw.entropy', gt=4.0, max=7)

        # search for traces no larger than 1MiB (1 << 20 == 2 ** 20 == 1048576 bytes)
        Range('data.raw.size', max=1 << 20)

        # search for traces with peculiar names (matches file name aab.txt, but not ccb.txt)
        Range('file.name', min='aa', max='cc')
    """

    _range_keys = {
        'gt': '>',
        'gte': '>=',
        'lt': '<',
        'lte': '<=',
        'max': '<=',
        'maxvalue': '<=',
        'max_value': '<=',
        'min': '>=',
        'minvalue': '>=',
        'min_value': '>=',
    }

    def __init__(self, field, **ranges):
        """
        Create a new `.Range` query.

        :param field: the field to query for
        :param ranges: keyword arguments of the following forms:

            - ``>``, ``gt``: value should be greater than supplied value;
            - ``>=``, ``gte``, ``min``, ``minvalue``, ``min_value``: value
              should be greater or equal to supplied value;
            - ``<``, ``lt``: value should be less than supplied value;
            - ``<=``, ``lte``, ``max``, ``maxvalue``, ``max_value``: value
              should be less or equal to supplied value;
        """
        self.field = field
        # translate kwarg type range to hansken keys
        self.ranges = {self._range_keys.get(key, key): value for key, value in ranges.items()}
        if len(ranges) != len(self.ranges):
            raise ValueError('duplicate mapping in ranges for Range')

    def as_dict(self):
        q = {'field': self.field}
        q.update(self.ranges)
        return {'range': q}

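# Illustrative example (not part of the original module): keyword arguments
# are translated to HQL comparison operators in the wire format:
#
#     >>> Range('data.raw.size', gt=512, max=1 << 20).as_dict()
#     {'range': {'field': 'data.raw.size', '>': 512, '<=': 1048576}}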
class Exists(Query):
    """
    Search for traces that have a particular field, for example:

    .. code-block:: python

        Exists('email.headers.In-Reply-To')
    """

    def __init__(self, field):
        self.field = field

    def as_dict(self):
        return {'exists': self.field}

class Phrase(Query):
    """
    Search for a phrase of terms, occurring within a particular distance of
    each other, for example:

    .. code-block:: python

        Phrase('email.subject', 'sell you a bomb')

        # will also match "sell you a bomb", not restricted to just email.subject
        Phrase('sell bomb', distance=2)
    """

    def __init__(self, field_or_value, value=None, distance=0):
        """
        Create a new `.Phrase` query.

        :param field_or_value: the field to search, or (when *value* is not
            supplied) the search value
        :param value: value to search for (only needed when searching a
            specific field)
        :param distance: the max number of position displacements between
            terms in the phrase (0 being an exact phrase match)
        """
        # allow first param to be the value, set field to 'text' to request a general phrase search
        if value is None:
            value = field_or_value
            # switch to magical property 'text', expanding to both 'meta' and 'data'
            field = 'text'
        else:
            field = field_or_value

        if len(value) > 1000:
            raise ValueError('value too long for Phrase: {}'.format(len(value)))
        if '?' in value or '*' in value:
            raise ValueError('wildcard in value not allowed for Phrase: {}'.format(value))

        self.field = field
        self.value = value
        self.distance = distance

    def as_dict(self):
        return {'phrase': {
            'field': self.field,
            'value': self.value,
            'distance': self.distance,
        }}

class GeoBox(Query):
    """
    Search for traces with location data within the bounding box between two
    corner points: southwest and northeast, for example:

    .. code-block:: python

        # a location can either be a 2-tuple (…)
        GeoBox('gps.latlong', (-1, -2), (3, 4))

        # (…) or an ISO 6709 latlong string
        GeoBox('gps.latlong', '+12.5281-070.0229', '+13.5281-080.0229')
    """

    def __init__(self, field, southwest, northeast):
        self.field = field
        self.sw = southwest
        self.ne = northeast

    def as_dict(self):
        return {'geobox': {
            'field': self.field,
            'southwest': _format_value(self.sw),
            'northeast': _format_value(self.ne),
        }}

class HQLHuman(Query):
    """
    Search for traces using HQL Human query syntax, for example:

    .. code-block:: python

        HQLHuman('file.name:query.py')
        HQLHuman('data.raw.size>1024')
    """

    def __init__(self, query):
        if not isinstance(query, str):
            raise TypeError('HQL-Human query not a string: {}'.format(query))

        self.query = query

    def as_dict(self):
        return {'human': self.query}

def to_query(query):
    """
    Make sure *query* is a `.Query` instance by either wrapping it with a
    `.HQLHuman` or returning it as is.

    :param query: either a `str` or a `.Query`
    :return: a `.Query` instance
    :raise TypeError: when *query*'s type is not acceptable
    """
    if query is None:
        return None
    if isinstance(query, str):
        return HQLHuman(query)
    if isinstance(query, Query):
        return query

    raise TypeError('query should be either str or hansken.query.Query, not {}'.format(type(query)))

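# Illustrative example (not part of the original module): to_query leaves
# Query instances untouched and wraps plain strings as HQL Human queries:
#
#     >>> to_query('file.name:query.py').as_dict()
#     {'human': 'file.name:query.py'}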
_sort_direction = {
    'ascending': 'ascending',
    'asc': 'ascending',
    '+': 'ascending',
    'descending': 'descending',
    'desc': 'descending',
    '-': 'descending',
}

class Sort:
    def __init__(self, field, direction=None, filter=None, mode=None, value=None):
        """
        Creates a sort clause for use with a search request. The *mode*
        parameter determines what kind of sorting should be applied:

        - ``value``: a regular sort-by-value (the default applied by the remote);
        - ``exists``: simply sort on whether the sort field has a value;
        - ``cosineSimilarity``: use parameter *value* to sort on the cosine
          similarity between *value* and the value of the sort field;
        - ``manhattanDistance``: similarly sort on the manhattan distance
          (or L1 norm) between *value* and the value of the sort field;
        - ``euclideanDistance``: similarly sort on the euclidean distance
          (or L2 norm) between *value* and the value of the sort field;

        :param field: the field to sort on
        :param direction: the sorting direction (ascending or descending, or
            ``None`` to auto-determine the sort direction from other
            arguments)
        :param filter: an optional query to restrict the tracelets included
            for sorting
        :param mode: a sort mode (see above)
        :param value: a (vector) value to use for similarity / distance
            calculations in applicable sorting modes (see above)
        """
        self.field = field
        self.filter = filter
        self.mode = mode
        self.value = value

        if isinstance(self.value, Vector):
            # if value is supplied as a Vector object, coerce it to a str
            # otherwise, leave it as-is (already a str, None, list of floats, …)
            self.value = str(value)

        if self.value and not self.mode:
            # mode is required when value is supplied
            # default to cosine similarity if user supplied no mode
            self.mode = 'cosineSimilarity'

        if direction:
            # require arg to be known when provided
            self.direction = _sort_direction[direction.lower()]
        else:
            # no explicit direction specified, default to ascending, unless the mode is cosineSimilarity
            # (high similarity score is 'low distance', inverting the intuitive sorting order)
            self.direction = 'descending' if self.mode == 'cosineSimilarity' else 'ascending'

    def as_dict(self):
        return omit_empty({
            'field': self.field,
            'direction': self.direction,
            'filter': to_query(self.filter).as_dict() if self.filter else None,
            'mode': self.mode,
            'value': self.value,
        })

    @classmethod
    def from_str(cls, sort):
        """
        Creates a `.Sort` from *sort*, parsing field, direction and filter.
        Formats supported:

        - ``some.field``: sort on field "some.field", ascending
        - ``some.field+``: sort on field "some.field", ascending
        - ``some.field-``: sort on field "some.field", descending
        - ``some.field | query*``: sort on field "some.field" within matches
          for query "query*", ascending (sorting non-matches after matches)

        :param sort: a sorting string to parse
        :return: a `.Sort` instance
        """
        parts = sort.split('|', 1)

        direction = None
        field = parts[0].strip()
        if field[-1] in ('+', '-'):
            direction = _sort_direction[field[-1]]
            field = field[:-1]

        filter = None
        if len(parts) == 2:
            filter = HQLHuman(parts[1].strip())

        return cls(field, direction, filter)

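# Illustrative example (not part of the original module): Sort.from_str parses
# a compact sorting expression into field and direction:
#
#     >>> s = Sort.from_str('file.name-')
#     >>> s.field, s.direction
#     ('file.name', 'descending')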
def to_sort(sort):
    if sort is None:
        return None
    if isinstance(sort, str):
        return Sort.from_str(sort)
    if isinstance(sort, Sort):
        return sort

    raise TypeError('sort should be either str or hansken.query.Sort, not {}'.format(type(sort)))

class Facet:
    def __init__(self, field, size=100, include_total=None, scale=None, filter=None):
        self.field = field
        self.size = size
        self.include_total = include_total
        self.filter = to_query(filter)

        if scale:
            # discourage use of direct Facet with custom parsed scale
            warnings.warn(DeprecationWarning('using scale on Facet is deprecated, use one of the sub types instead'))

        self.scale, self.base, self.interval = _parse_scale(scale)
        self.precision = self.min = self.max = self.sw = self.ne = None

    def as_dict(self):
        """
        Turns this facet into a dict as specified by the Hansken Query
        Language Specification.
        """
        return omit_empty({
            'field': self.field,
            'size': self.size,
            'includeTotal': self.include_total,
            'scale': self.scale,
            'base': self.base,
            'interval': self.interval,
            # make sure to format dates and date(time)s as ISO 8601
            'min': _format_value(self.min),
            'max': _format_value(self.max),
            'precision': self.precision,
            # make sure to format coordinates as ISO 6709
            'southwest': _format_value(self.sw),
            'northeast': _format_value(self.ne),
            'filter': self.filter.as_dict() if isinstance(self.filter, Query) else self.filter
        })

    def __str__(self):
        """
        Encodes this facet as a JSON string.
        """
        return json.dumps(self.as_dict())

class TermFacet(Facet):
    def __init__(self, field, size=100, include_total=None, filter=None):
        """
        Create a new `.TermFacet` to use with a query. A term facet can be
        created on any type of field, counting the occurrences of any value.

        :param field: field to create a facet on
        :param size: the max number of facet counters to return, default is 100
        :param filter: only count traces matching filter
        """
        super().__init__(field=field, size=size, include_total=include_total, filter=filter)

class RangeFacet(Facet):
    def __init__(self, field, scale, base=None, interval=None, min=None, max=None, include_total=None, filter=None):
        """
        Create a new `.RangeFacet` to use with a query. A range facet can be
        made on either numeric or date fields.

        :param field: field to create a facet on
        :param scale:
            - ``year``, ``month``, ``day``, ``hour``, ``minute`` or
              ``second`` for date fields
            - ``linear`` or ``log`` for numeric fields
        :param base: logarithmic base when scale is ``'log'``
        :param interval: interval or bucket size when scale is ``'linear'``
        :param min: minimum value to include in the facet result
        :param max: maximum value to include in the facet result
        :param filter: only count traces matching filter
        """
        if scale not in ('log', 'linear', 'year', 'month', 'day', 'hour', 'minute', 'second'):
            raise ValueError('unknown scale: {}'.format(scale))

        if scale in ('year', 'month', 'day', 'hour', 'minute', 'second') and (base or interval):
            # date range facet, base and interval are illegal
            raise ValueError('date range facet cannot be combined with base or interval')
        elif scale == 'log' and (not base or interval):
            # numeric log facet, requires base, interval is illegal
            raise ValueError('numeric log facet needs base, cannot be combined with interval')
        elif scale == 'linear' and base:
            # numeric linear facet, base is illegal
            raise ValueError('numeric linear facet cannot be combined with base')

        super().__init__(field=field, include_total=include_total, filter=filter)
        self.scale = scale
        self.base = base
        self.interval = interval
        self.min = min
        self.max = max

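# Illustrative example (not part of the original module), assuming
# hansken.util.omit_empty drops the None-valued entries from the result: a
# logarithmic range facet counting traces per power-of-two size bucket would
# serialize roughly as follows:
#
#     >>> RangeFacet('data.raw.size', scale='log', base=2).as_dict()
#     {'field': 'data.raw.size', 'size': 100, 'scale': 'log', 'base': 2}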
class GeohashFacet(Facet):
    def __init__(self, field, size=100, include_total=None, precision=1, southwest=None, northeast=None, filter=None):
        """
        Create a new `.GeohashFacet` to use with a query.

        :param field: field to create a facet on
        :param size: the max number of facet counters to return, default is 100
        :param precision: number of characters of the returned geohashes
        :param southwest: south west bound / corner point
        :param northeast: north east bound / corner point
        :param filter: only count traces matching filter
        """
        bounds = (southwest, northeast)
        if any(bounds) and not all(bounds):
            raise ValueError('specify either both southwest and northeast bounds or neither')

        super().__init__(field=field, size=size, include_total=include_total, filter=filter)
        self.precision = precision
        self.sw = southwest
        self.ne = northeast

def to_facet(facet):
    if facet is None:
        return None
    if isinstance(facet, str):
        return TermFacet(facet)
    if isinstance(facet, Facet):
        return facet

    raise TypeError('facet should be either str or hansken.query.Facet, not {}'.format(type(facet)))