from json import JSONEncoder

import sys

import re
from elasticsearch import Elasticsearch
from elasticsearch_dsl import *

import os
from os import path

# Maps a lower-cased persistent-identifier type to a URL template with a
# single %s placeholder for the identifier value.  Only the first few types
# have dedicated resolvers; the rest currently fall back to the NCBI nucest
# GenBank viewer.
pid_resolver = {
    "pdb": "http://www.rcsb.org/pdb/explore/explore.do?structureId=%s",
    "ncbi-n": "http://www.ncbi.nlm.nih.gov/gquery/?term=%s",
    "pmid": "http://www.ncbi.nlm.nih.gov/pubmed/%s",
    "pmcid": "http://www.ncbi.nlm.nih.gov/pmc/articles/%s",
    "pubmedid": "http://www.ncbi.nlm.nih.gov/pubmed/%s",
    "doi": "http://dx.doi.org/%s",
    "genbank": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "nuccore": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "swiss-prot": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "arrayexpress": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "biomodels": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "bmrb": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "ena": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "geo": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "ensembl": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "mgi": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "bind": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "pride": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "ddbj": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "bioproject": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "embl": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
    "sra": "http://www.ncbi.nlm.nih.gov/nucest/%s?report=genbank",
}

# DOI syntax pattern (Crossref-style); compiled once at import time instead
# of on every resolveIdentifier() call.
_DOI_REGEX = re.compile(r"\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\'<>])\S)+)\b")


def resolveIdentifier(pid, pid_type):
    """Return a resolvable URL for identifier *pid* of type *pid_type*.

    A value that syntactically looks like a DOI is resolved as a DOI
    regardless of the declared type.  Unknown types fall back to the
    identifiers.org resolver.  Returns the empty string when *pid_type*
    is None.
    """
    if pid_type is None:
        return ""
    if _DOI_REGEX.match(pid):
        # Value looks like a DOI: override the declared type.
        # (The original left a Python-2-only debug print here.)
        pid_type = 'doi'
    key = pid_type.lower()
    if key in pid_resolver:
        return pid_resolver[key] % pid
    if key == 'openaire':
        return "https://www.openaire.eu/search/publication?articleId=%s" % pid.replace('oai:dnet:', '')
    if key == 'url':
        # The identifier already is a URL.
        return pid
    # Generic fallback: identifiers.org keeps the caller-supplied casing.
    return "http://identifiers.org/%s:%s" % (pid_type, pid)


def get_property(properties_path=None):
    """Parse a ``key=value`` properties file into a dict.

    :param properties_path: optional path to the properties file; defaults
        to ``../../api.properties`` relative to this module (backward
        compatible with the original no-argument call).
    :return: dict mapping stripped keys to stripped values.
    """
    if properties_path is None:
        properties_path = path.join(os.path.dirname(os.path.realpath(__file__)), '../../api.properties')
    p = {}
    # with-statement guarantees the handle is closed (the original leaked
    # it); split on the first '=' only so values may themselves contain '='.
    with open(properties_path) as f:
        for line in f:
            line = line.strip()
            if not line or '=' not in line:
                # Skip blank and malformed lines instead of raising IndexError.
                continue
            key, value = line.split('=', 1)
            p[key.strip()] = value.strip()
    return p


def create_typology_filter(value):
    """Build a match query on the ``typology`` field."""
    criteria = {'typology': value}
    return Q('match', **criteria)


def create_pid_type_filter(value):
    """Build a nested bool/match query on ``localIdentifier.type``."""
    type_match = Q('match', **{'localIdentifier.type': value})
    inner = Q('bool', must=[type_match])
    return Q('nested', path='localIdentifier', query=inner)


def create_pid_query(value):
    """Build a nested bool/match query on ``localIdentifier.id``."""
    id_match = Q('match', **{'localIdentifier.id': value})
    inner = Q('bool', must=[id_match])
    return Q('nested', path='localIdentifier', query=inner)


def create_publisher_filter(value):
    """Build a match query on the ``publisher`` field."""
    criteria = {'publisher': value}
    return Q('match', **criteria)


def create_datasource_filter(value):
    """Build a nested bool/match query on ``datasources.datasourceName``."""
    name_match = Q('match', **{'datasources.datasourceName': value})
    inner = Q('bool', must=[name_match])
    return Q('nested', path='datasources', query=inner)


class DLIESResponseEncoder(JSONEncoder):
    """JSON encoder that serializes arbitrary objects via their attribute dict."""

    def default(self, o):
        # vars(o) is exactly o.__dict__ for plain objects.
        return vars(o)


class DLIESResponse(object):
    """Container for a search response: facet buckets, total count and hits.

    :param facet: dict of facet name -> list of bucket dicts; defaults to
        empty ``pid``/``typology``/``datasource`` facets.
    :param total: total number of matching documents.
    :param hits: list of result records.
    """

    def __init__(self, facet=None, total=0, hits=None):
        # Use None sentinels: the original ``hits=[]`` mutable default was
        # shared by every DLIESResponse created without an explicit value.
        if facet is None:
            facet = dict(pid=[], typology=[], datasource=[])
        if hits is None:
            hits = []
        self.facet = facet
        self.total = total
        self.hits = hits


class DLIESConnector(object):
    """Thin query layer over an Elasticsearch index of DLI objects.

    Connection settings come from get_property(): 'es_index' is a
    comma-separated host list and 'api.index' the index name.
    NOTE(review): this module uses Python 2 constructs (print statements,
    sys.exc_traceback) and will not run unchanged on Python 3.
    """

    def __init__(self):
        props = get_property()
        # 'es_index' may list several hosts separated by commas.
        self.index_host = [x.strip() for x in props['es_index'].split(',')]
        self.client = Elasticsearch(hosts=self.index_host)
        self.index_name = props['api.index']

    def query_by_id(self, id):
        """Find 'object' documents whose localIdentifier.id matches *id*.

        Returns a DLIESResponse with facet buckets for pid types,
        datasources, typologies and publishers.
        """
        s = Search(using=self.client, index=self.index_name).doc_type('object')
        s = s.query(create_pid_query(id))
        # Facet aggregations; nested fields need a 'nested' bucket first.
        s.aggs.bucket('typologies', 'terms', field='typology')
        s.aggs.bucket('all_datasources', 'nested', path='datasources').bucket('all_names', 'terms',
                                                                              field='datasources.datasourceName')
        s.aggs.bucket('all_publisher', 'terms', field='publisher')
        s.aggs.bucket('all_pids', 'nested', path='localIdentifier').bucket('all_types', 'terms',
                                                                           field='localIdentifier.type')
        response = s.execute()

        hits = []

        for index_result in response.hits:
            # '_d_' is the raw _source dict behind the DSL hit wrapper
            # (elasticsearch_dsl internal — confirm against the installed version).
            input_source = index_result.__dict__['_d_']
            fixed_titles = []

            # Attach a resolvable URL to every local identifier.
            for ids in input_source.get('localIdentifier', []):
                ds = resolveIdentifier(ids['id'], ids['type'])
                ids['url'] = ds
            # Strip surrounding double quotes that some stored titles carry.
            for t in input_source.get('title', []):
                if len(t) > 0 and t[0] == '"' and t[-1] == '"':
                    fixed_titles.append(t[1:-1])
                else:
                    fixed_titles.append(t)
            input_source['title'] = fixed_titles
            hits.append(input_source)

        # Flatten aggregation buckets into simple {key, count} dicts.
        pid_types = []
        for tag in response.aggs.all_pids.all_types.buckets:
            pid_types.append(dict(key=tag.key, count=tag.doc_count))

        datasources = []
        for tag in response.aggs.all_datasources.all_names.buckets:
            datasources.append(dict(key=tag.key, count=tag.doc_count))

        typologies = []
        for tag in response.aggs.typologies.buckets:
            typologies.append(dict(key=tag.key, count=tag.doc_count))

        publishers = []
        for tag in response.aggs.all_publisher.buckets:
            # Drop empty publisher names.
            if len(tag.key) > 0:
                publishers.append(dict(key=tag.key, count=tag.doc_count))

        return DLIESResponse(total=response.hits.total,
                             facet=dict(pid=pid_types, typology=typologies, datasource=datasources,
                                        publishers=publishers), hits=hits)

    def simple_query(self, textual_query, start=None, end=None, user_filter=None):
        """Full-text search over the '_all' field with optional filters.

        *user_filter* is a '__'-separated list of 'key_value' pairs where
        key is one of typology/datasource/pidtype/publisher.  *start*/*end*
        bound the result slice; *end* defaults to start + 10.
        NOTE(review): the hit post-processing and bucket flattening below
        duplicate query_by_id almost verbatim.
        """
        s = Search(using=self.client, index=self.index_name).doc_type('object')
        q = Q('match', _all=textual_query)
        s.aggs.bucket('typologies', 'terms', field='typology')
        s.aggs.bucket('all_datasources', 'nested', path='datasources').bucket('all_names', 'terms',
                                                                              field='datasources.datasourceName')
        s.aggs.bucket('all_publisher', 'terms', field='publisher')

        filter_queries = []
        if user_filter is not None and len(user_filter) > 0:
            # Each filter looks like 'key_value'; unknown keys are ignored.
            for f in user_filter.split('__'):
                filter_key = f.split('_')[0]
                filter_value = f.split('_')[1]
                if filter_key == 'typology':
                    filter_queries.append(create_typology_filter(filter_value))
                elif filter_key == 'datasource':
                    filter_queries.append(create_datasource_filter(filter_value))
                elif filter_key == 'pidtype':
                    filter_queries.append(create_pid_type_filter(filter_value))
                elif filter_key == 'publisher':
                    filter_queries.append(create_publisher_filter(filter_value))

        if len(filter_queries) > 0:
            # All selected filters must match (bool/must).
            s = s.query(q).filter(Q('bool', must=filter_queries))
        else:
            s = s.query(q)

        s.aggs.bucket('all_pids', 'nested', path='localIdentifier').bucket('all_types', 'terms',
                                                                           field='localIdentifier.type')

        if start is not None:
            if end is None:
                # Default page size of 10.
                end = start + 10
            s = s[start:end]
        response = s.execute()

        hits = []

        for index_result in response.hits:
            # '_d_' is the raw _source dict behind the DSL hit wrapper.
            input_source = index_result.__dict__['_d_']
            fixed_titles = []

            # Attach a resolvable URL to every local identifier.
            for ids in input_source.get('localIdentifier', []):
                ds = resolveIdentifier(ids['id'], ids['type'])
                ids['url'] = ds
            # Strip surrounding double quotes that some stored titles carry.
            for t in input_source.get('title', []):
                if len(t) > 0 and t[0] == '"' and t[-1] == '"':
                    fixed_titles.append(t[1:-1])
                else:
                    fixed_titles.append(t)
            input_source['title'] = fixed_titles
            hits.append(input_source)

        # Flatten aggregation buckets into simple {key, count} dicts.
        pid_types = []
        for tag in response.aggs.all_pids.all_types.buckets:
            pid_types.append(dict(key=tag.key, count=tag.doc_count))

        datasources = []
        for tag in response.aggs.all_datasources.all_names.buckets:
            datasources.append(dict(key=tag.key, count=tag.doc_count))

        typologies = []
        for tag in response.aggs.typologies.buckets:
            typologies.append(dict(key=tag.key, count=tag.doc_count))

        publishers = []
        for tag in response.aggs.all_publisher.buckets:
            if len(tag.key) > 0:
                publishers.append(dict(key=tag.key, count=tag.doc_count))

        return DLIESResponse(total=response.hits.total,
                             facet=dict(pid=pid_types, typology=typologies, datasource=datasources,
                                        publishers=publishers), hits=hits)

    def related_type(self, object_id, object_type, start=None):
        """Return 'scholix' relations from *object_id* to targets of *object_type*.

        :param start: optional offset; a page of 10 relations is returned.
            NOTE(review): start=0 is falsy and therefore skips slicing here.
        :return: list of raw relation dicts with target identifier URLs resolved.
        """
        args = {'target.objectType': object_type}
        query_type = Q('nested', path='target', query=Q('bool', must=[Q('match', **args)]))
        args_id = {'source.dnetIdentifier': object_id}
        query_for_id = Q('nested', path='source', query=Q('bool', must=[Q('match', **args_id)]))
        s = Search(using=self.client).index(self.index_name).doc_type('scholix').query(query_for_id & query_type)
        if start:
            s = s[start:start + 10]

        response = s.execute()
        hits = []

        for index_hit in response.hits:
            # Raw relation document behind the DSL hit wrapper.
            current_item = index_hit.__dict__['_d_']
            if 'target' in current_item:
                # Resolve each target identifier into a clickable URL.
                ids = []
                for item in current_item['target']['identifier']:
                    c_it = item
                    c_it['url'] = resolveIdentifier(item['identifier'], item['schema'])
                    ids .append(c_it)
                current_item['target']['identifier'] = ids
            hits.append(current_item)

        return hits

    def fix_collectedFrom(self, source, relation):
        """Copy provisionMode from the relation's collectedFrom entries into
        the matching datasources of *source* (matched by provider name).

        NOTE(review): returns None (implicitly) when *relation* is None,
        otherwise the mutated *source* — callers should handle both.
        """
        if relation is None:
            return
        relSource = relation.get('source')
        collectedFrom = relSource['collectedFrom']
        for coll in collectedFrom:
            for d in source['datasources']:
                if d['datasourceName'] == coll['provider']['name']:
                    d['provisionMode'] = coll['provisionMode']
        return source

    def item_by_id(self, id, type=None, start=None):
        """Fetch one 'object' document by ES id plus its related items.

        :param type: which related-type list ('publication'/'dataset'/'unknown')
            gets the *start* pagination offset; others fetch the first page.
            (Parameter shadows the builtin; kept for API compatibility.)
        :return: DLIESResponse whose hits are [object, related-items dict],
            or an empty DLIESResponse on any error.
        """
        try:
            res = self.client.get(index=self.index_name, doc_type='object', id=id)
            hits = []
            input_source = res['_source']
            fixed_titles = []
            # Strip surrounding double quotes that some stored titles carry.
            for t in input_source.get('title',[]):
                if len(t) > 0 and t[0] == '"' and t[-1] == '"':
                    fixed_titles.append(t[1:-1])
                else:
                    fixed_titles.append(t)
            input_source['title'] = fixed_titles

            # Attach a resolvable URL to every local identifier.
            for ids in input_source.get('localIdentifier', []):
                ds = resolveIdentifier(ids['id'], ids['type'])
                ids['url'] = ds
            related_publications = []
            related_dataset = []
            related_unknown = []

            rel_source = None
            # NOTE(review): .get(...) returns None for a missing key and
            # 'None > 0' is False only under Python 2 — this comparison
            # raises TypeError on Python 3.
            if input_source.get('relatedPublications') > 0:
                if 'publication' == type:
                    related_publications = self.related_type(id, 'publication', start)
                else:
                    related_publications = self.related_type(id, 'publication')
                if len(related_publications) > 0:
                    rel_source = related_publications[0]
                else:
                    rel_source = {}
            if input_source.get('relatedDatasets') > 0:
                if 'dataset' == type:
                    related_dataset = self.related_type(id, 'dataset', start)
                else:
                    related_dataset = self.related_type(id, 'dataset')
                # NOTE(review): unlike the publications branch above, this
                # indexes [0] without a length guard — IndexError if empty
                # (swallowed by the broad except below).
                rel_source = related_dataset[0]
            if input_source.get('relatedUnknown') > 0:
                if 'unknown' == type:
                    related_unknown = self.related_type(id, 'unknown', start)
                else:
                    related_unknown = self.related_type(id, 'unknown')
                # NOTE(review): same unguarded [0] as the dataset branch.
                rel_source = related_unknown[0]

            input_source = self.fix_collectedFrom(input_source, rel_source)
            hits.append(input_source)

            hits.append(dict(related_publications=related_publications, related_dataset=related_dataset,
                             related_unknown=related_unknown))

            return DLIESResponse(total=1, hits=hits)
        except Exception as e:
            # Broad catch: any failure yields an empty response.
            # NOTE(review): Python 2 print statements; sys.exc_traceback is
            # a Python-2-only, deprecated attribute.
            print "Error on getting item "
            print e
            print "on line %i" % sys.exc_traceback.tb_lineno
            return DLIESResponse()
