# TODO:
# - clean up the code and structure it better
# - write a 'merge' function
import pandas as pd
import numpy as np
import time
import shutil
import namedentities as ne
import html
import webbrowser
from IPython.display import Markdown, display
from scholarly import scholarly  # needed by the Google Scholar helpers below

import os

os.environ['PYB_CONFIG_FILE'] = "/home/nano/.pybliometrics/config.ini"

from pybliometrics.scopus import AbstractRetrieval, ScopusSearch
from fp.fp import FreeProxy
from crossref.restful import Works
from difflib import SequenceMatcher

from django.template.loader import get_template, render_to_string
from django.template import Context
from django.http import HttpResponse

def pdflatex_cv(queryset):
    # compiles the CV with pdflatex and creates a short version (first two pages)
    from PyPDF2 import PdfReader, PdfWriter
    import subprocess
    from internal.models import CV
    for qs in queryset:
        entry = CV.objects.get(id=qs.id)

        # options have to precede the file name, otherwise pdflatex does not parse them
        p = subprocess.Popen(['pdflatex', '-interaction=nonstopmode', 'cv.tex'], cwd='/var/www/nanotud/uploads/cv_files/')
        p.wait()
        
        entry.pub_include='uploads/cv_files/auto_include/papers_auto_django.tex'
        entry.log_file='uploads/cv_files/cv.log'
        entry.pdf_file='uploads/cv_files/cv.pdf'

        entry.tex_file='uploads/cv_files/cv.tex'
        entry.publication_include='uploads/cv_files/auto_include/papers_auto_django.tex'
        entry.conferences_include='uploads/cv_files/auto_include/conferences_auto'
        entry.patents_include='uploads/cv_files/auto_include/patents_auto'
        entry.papers_selected_include='uploads/cv_files/auto_include/papers_auto_sel.tex'
        entry.phds_include='uploads/cv_files/auto_include/phds_auto'
        entry.talks_include='uploads/cv_files/auto_include/talks_auto'
        entry.rest_include='uploads/cv_files/auto_include/rest_auto'
        
    
        # create cv_short.pdf (only the first two pages of cv.pdf)
        pdf = PdfReader('/var/www/nanotud/uploads/cv_files/cv.pdf')
        pdfWriter = PdfWriter()
        for page_num in [0, 1]:
            pdfWriter.add_page(pdf.pages[page_num])
        with open('/var/www/nanotud/uploads/cv_files/cv_short.pdf', 'wb') as f:
            pdfWriter.write(f)
        entry.pdf_short='uploads/cv_files/cv_short.pdf'
        entry.save()
        shutil.copy2('/data/uploads/cv_files/cv.pdf','/data/uploads/cv_files/cv_backup.pdf')
    
def create_publication_include(queryset):
    # update papers_auto_django.tex for the CV from the publication Django DB
    from research.models import Publication
    from internal.models import CV

    context = {'publication_list': Publication.objects.filter(author_list__icontains='Cuniberti')}
    # context = {'publication_list': Publication.objects.filter(chair_pub=True)}
    content = render_to_string('publication_export.tex', context, using='tex')
    with open('/var/www/nanotud/uploads/cv_files/auto_include/papers_auto_django.tex', 'w') as static_file:
        static_file.write(content)

    context = {'publication_list': Publication.objects.filter(cvselected=True)}
    content = render_to_string('publication_export_selected.tex', context, using='tex')
    with open('/var/www/nanotud/uploads/cv_files/auto_include/papers_auto_sel.tex', 'w') as static_file:
        static_file.write(content)
    
    
    for qs in queryset:
        entry = CV.objects.get(id = qs.id)
        entry.publication_include='uploads/cv_files/auto_include/papers_auto_django.tex'
        entry.papers_selected_include='uploads/cv_files/auto_include/papers_auto_sel.tex'
        entry.save()

def set_slug_2(queryset):
    # re-save all members so the model's save() can (re)generate their slugs
    from internal.models import Member
    for member in Member.objects.all():
        member.save()
    return

def set_slug_3(queryset):
    # re-save all publications so the model's save() can (re)generate their slugs
    from research.models import Publication
    for pub in Publication.objects.all():
        pub.save()
    return

def set_slug_4(queryset):
    # re-save all events so the model's save() can (re)generate their slugs
    from external.models import Event
    for ev in Event.objects.all():
        ev.save()
    return
    
    
'''def set_slug(queryset):
    from research.models import Project
    projects = Project.objects.filter()
    for project in projects:
        project.save()
    return'''
    
'''def set_slug(queryset):
    from teaching.models import Seminar
    seminars = Seminar.objects.filter()
    for sem in seminars:
        sem.save()
    return'''


def set_slug(queryset):
    # re-save all publications so the model's save() can (re)generate their slugs
    from research.models import Publication
    for pub in Publication.objects.all():
        pub.save()
    return

    
    

def add_dois_scholar_info(queryset):
    from research.models import Add_Publication
    if len(queryset) == 1:
        for qs in queryset:
            print(f'Find publications for {qs.author} from {qs.year} in Google Scholar')
            search_query = scholarly.search_author(qs.author)   # query Google Scholar
            author = scholarly.fill(next(search_query))
            # create a DataFrame containing all publications
            # (DataFrame.append was removed in pandas 2.0, so collect the bib dicts first)
            pub_google = pd.DataFrame([pub['bib'] for pub in author['publications']])
            pub_google['google_pub'] = author['publications']
            # pub_google.sort_values('pub_year', ascending=False, inplace=True)
            pub_google = pub_google[~pub_google['title'].isnull()]    # delete all entries with no title
            pub_google['title'] = pub_google['title'].astype('str')
            if qs.year is None:
                pub_year = pub_google
            else:
                pub_year = pub_google[pub_google['pub_year'] == str(qs.year)]   # select only publications from the given year
            print(f"{len(pub_year)} publications found")
            if len(pub_year) == 0:
                entry = Add_Publication.objects.get(id=qs.id)
                entry.comment = 'No publication found in Google Scholar'
                entry.save()
                return
            for year, title in zip(pub_year.pub_year, pub_year.title):
                if pd.isna(year):
                    year = None
                else:
                    year = int(year)
                # a DOI lookup via find_doi_for_title() used to run here, but it is
                # disabled; the DOI is filled in later by add_publication_info()
                new_pub = Add_Publication(title=title, year=year, author=qs.author.split(',')[0])
                new_pub.save()
    else:
        print("Please select only one entry")


def clean_abstract(txt):
    # strips leading copyright notices, trailing copyright symbols and JATS markup
    if txt is None:
        return None
    string = ''
    if txt[0] == '©' or 'Copyright' in txt[0:15] or 'c○' in txt[:5] or 'COPYRIGHT' in txt[0:15] or 'This journal is' in txt[:30]:
        if txt.find('Weinheim') != -1:
            string = string + txt[txt.find('Weinheim')+8:] + '\n'
        elif txt.find('Switzerland') != -1:
            string = string + txt[txt.find('Switzerland')+11:]
        else:
            string = string + txt[txt.find('.')+1:]
    else:
        if '©' in txt[-100:]:
            string = string + txt[:txt.find('©')]
        else:
            string = string + txt
    # str.replace returns a new string, so the result has to be reassigned
    string = string.replace('</jats:sub>', '')
    string = string.replace('<jats:sub>', '')
    string = string.replace('<jats:title>Abstract</jats:title>', '')
    return string
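
# Hand-traced sanity check for clean_abstract() (the sample text is illustrative,
# not from the database): the leading copyright sentence is cut at its first period,
# so the result keeps a leading space.
#   >>> clean_abstract('© 2020 The Authors. We demonstrate the effect.')
#   ' We demonstrate the effect.'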


def check_unicode_char(inp_str):
    # replaces en dashes with plain hyphens
    new = ''
    for c0 in inp_str:
        if ord(c0) == 8211:     # 8211 == ord('–'), the en dash (U+2013)
            new += '-'
        else:
            new += c0
    return new
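
# Hand-traced sanity check for check_unicode_char() (illustrative input):
#   >>> check_unicode_char('Nano–Bio Lett.')
#   'Nano-Bio Lett.'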


def add_publication_info(queryset):
    from research.models import Publication, Add_Publication
    from internal.models import Member
    
    # from pybliometrics.scopus.utils import config
    # import json
    # with open('/etc/config.json') as config_file:
    #     config2 = json.load(config_file)
    # config['Authentication']['APIKey'] = config2['APIKey']
    for qs in queryset:
        no_doi_found = False
        entry = Add_Publication.objects.get(id=qs.id)
        doi = str(qs.doi).strip()    # remove spaces at beginning and end
        if doi == '' or doi == 'None':
            if qs.title is not None:
                doi = find_doi_for_title(qs.title, qs.author)    # no DOI given, search by title
                if doi is None or pd.isna(doi):
                    entry.comment = 'No DOI found for title'
                    entry.save()
                    no_doi_found = True
                else:
                    entry.doi = doi
                    entry.save()
        num = Publication.objects.filter(doi__iexact=doi).count()   # number of publications with the same DOI
        (title, authors, volume, pub_date, year, open_access, page, pub_type,
         number, journal_short, journal) = (None,) * 11
        if num == 0 and not no_doi_found:   # DOI not in Publications yet
            try:
                works = Works()    # check the Crossref entry
                item_CR = works.doi(doi)
            except Exception:
                print(doi, ' not found in Crossref.')
                item_CR = None
            try:
                item_scopus = AbstractRetrieval(doi, refresh=True)
            except Exception:
                print(doi, ' not found in Scopus. Are the API key and permissions correct?')
                item_scopus = None
            if item_CR is None and item_scopus is None:
                entry.comment = 'DOI neither found in Scopus nor in Crossref. Please check whether the DOI is correct.'
                entry.save()
            else:  # either Crossref or Scopus found something; fill the publication with info
                abstract = clean_abstract(get_abstract(item_CR, item_scopus))
                if item_scopus is not None:
                    title = html.unescape(item_scopus.title)
                    title = check_unicode_char(title)
                    authors = html.unescape(format_authors_scopus(doi, item_scopus))
                    volume = item_scopus.volume
                    if item_scopus.sourcetitle_abbreviation is not None:
                        journal_short = check_unicode_char(item_scopus.sourcetitle_abbreviation)
                    if item_scopus.publicationName is not None:
                        journal = check_unicode_char(item_scopus.publicationName)
                    try:
                        page = item_scopus.pageRange.split('-')[0].strip()
                    except Exception:
                        try:
                            page = item_scopus.startingPage
                        except Exception:
                            page = None
                    try:
                        pub_date = item_scopus.coverDate
                        year = pub_date[0:4]
                    except Exception:
                        pub_date = None
                        year = None
                    try:
                        open_access = item_scopus.openaccessFlag
                    except Exception:
                        open_access = None
                if item_CR is not None:
                    if authors is None:
                        authors = format_authors_crossref(doi, item=item_CR)
                    if title is None:
                        title = item_CR['title'][0]
                    if volume is None:
                        try:
                            volume = str(item_CR['volume'])
                        except Exception:
                            pass
                    if pub_type is None:
                        pub_type = item_CR['type']
                    if pub_date is None:
                        try:
                            # pub_date_cross() returns 'YYYY.DD.MM'; convert to 'YYYY-MM-DD'
                            pub_date = pub_date_cross(item_CR)
                            pub_date = f"{pub_date.split('.')[0]}-{pub_date.split('.')[-1]}-{pub_date.split('.')[1]}"
                        except Exception:
                            pub_date = None
                    if page is None:
                        try:
                            page = item_CR['page']
                        except Exception:
                            pass
                    if number is None:
                        try:
                            number = item_CR['issue']
                        except Exception:
                            pass
                    if journal_short is None:
                        try:
                            journal_short = item_CR['short-container-title'][0]
                        except Exception:
                            pass
                    if journal is None:
                        try:
                            journal = item_CR['container-title'][0]
                        except Exception:
                            pass
                    if year is None:
                        try:
                            year = int(item_CR['created']['date-parts'][0][0])
                        except Exception:
                            pass
                    # if open_access is None:
                    #     open_access = False  # default value
                if journal_short is None:
                    journal_short = journal
                    if journal_short is None:
                        journal_short = 'XXXX'

                pub = Publication(doi=doi, approved=False, year=year, title=title, author_list=authors,
                                  pub_date=pub_date, volume=volume, open_access=open_access,
                                  abstract=abstract, page=page, journal_short=journal_short,
                                  journal=journal, pub_type=pub_type, chair_pub=True)
                                    
                pub.save()
                if item_CR is None:
                    entry.comment = 'Information taken from Scopus.'
                elif item_scopus is None:
                    entry.comment = 'Information taken from Crossref.'
                else:
                    entry.comment = 'Information taken from Scopus and Crossref.'
                entry.added = True
                entry.save()
        elif no_doi_found:
            entry.comment = 'No DOI found.'
            entry.added = False
            entry.save()
        
        else:
            print("publication already there")
            entry.comment = 'DOI already in publication list. If you want to add it again, please delete the corresponding publication first.'
            entry.doi_in_pubs = True
            entry.added = False
            entry.save()
        # attach all not-yet-approved publications to the matching members
        members = Member.objects.all()
        # .filter(active=True)
        for member in members:
            pubs = Publication.objects.filter(approved=False).filter(author_list__icontains=member.given_name[0] + '. ' + member.last_name)
            for pub in pubs:
                member.publication.add(pub)
    return

def read_publications(file_name):
    """
    Reads publications from the database file in the format used by the webpage.
    
    Parameters
    ----------
    file_name : name of the database file
        Structure: Two columns (1 = property, 2 = value )
                   Entries are separated by 'enditem\n##################################'
            Example:
                '##########################################################################
                AG GC
                cvselected    no
                firstlast yes
                #filename
                cover ../pubs/reprints/covers/2020_Krok.jpg
                year 2020
                title        Modification of titanium implants using biofunctional nanodiamonds for enhanced antimicrobial properties
                authors Emilia Krok, Sascha Balakin, Jonas Jung, Frank Gross, J&ouml;rg Opitz, <b>G. Cuniberti</b>
                pubdate 2020.03.02
                type        regular
                reference    <i>Nanotechnology</i> <b>31</b>, 205603 (2020)
                DOI        10.1088/1361-6528/ab6d9b/
                abstract    The present study describes a novel antimicrobial surface using anodic oxidation of titanium and biofunctional
                detonation nanodiamonds (ND). ND have been loaded with antibiotics (amoxicillin or ampicillin) using poly
                (diallyldimethylammonium chloride)(PDDA). Successful conjugation with PDDA was determined by dynamic light scattering, …
                enditem
                ###########################################################################
                ...'

    Returns
    -------
    DataFrame
        Containing all publication entries from 'file_name'. Each row contains one publication and
        columns its values. If one property is not set in file_name, its value is set to 'np.NaN'.
    """
    with open(file_name, 'r') as file:
        content = file.read() 
    all_publications = content.split('enditem')[1:-2]      # list of all publications
    
    rows = []
    for article in all_publications:        # iterate over each publication
        attributes = article.split('\n')                   # each line contains property and value
        dictio = {}
        for att in attributes:
            att = att.replace('    ', '\t')
            try:
                # the following lines are messy because the data file has no homogeneous format:
                # sometimes properties and values are separated by a space, sometimes by four spaces,
                # one tab, two tabs, ... but as this goes back to 1996 ...
                # check if the line contains information to read or not:
                if (att[-1] != '#') and (att != '') and ('#' not in att[0:4]):
                    if 'abstract' in att[0:12]:        # line contains the abstract?
                        dictio['abstract'] = att.split('abstract')[-1].strip()
                    elif 'journalfullname' in att:
                        dictio['journalfullname'] = att.split(' ', maxsplit=1)[-1].strip()
                    elif 'arxivpasswd' in att:
                        dictio['arxivpasswd'] = att.split(' ', maxsplit=1)[-1].strip()
                    else:
                        split = att.split('\t', maxsplit=1)   # if a value is given: first element = property,
                                                              # second element = value
                        if len(split) == 1:                   # sometimes separated by a space
                            split = att.split(' ', maxsplit=1)
                        if len(split) == 1:
                            dictio[split[0].strip()] = np.NaN  # no split possible means no value
                        else:
                            dictio[split[0].strip()] = split[-1].strip()  # dictio[property] = value
            except IndexError:   # empty line
                pass
        if dictio != {}:
            rows.append(dictio)
    # DataFrame.append was removed in pandas 2.0, so the frame is built from a list of dicts
    df = pd.DataFrame(rows)
    df = df.fillna(np.NaN)
    return df
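
# Illustrative usage (the file name is a placeholder):
#   df = read_publications('database.papers')
#   df[df['year'] == '2020']      # all values are read as strings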


def entry_to_dict(entry):
    attributes = entry.split('\n')                   # each line contains property and value
    dictio = {}
    for att in attributes:
        att = att.replace('    ', '\t')
        try:
            if (att[-1] != '#') and (att != '') and ('#' not in att[0:3]):
                split = att.split('\t', maxsplit=1)   # if a value is given: first element = property
                if len(split) == 1:                   # sometimes separated by a space
                    split = att.split(' ', maxsplit=1)
                    if len(split) == 1:
                        dictio[split[0].strip()] = np.NaN  # no split possible means no value
                    else:
                        dictio[split[0].strip()] = split[-1].strip()  # dictio[property] = value
                else:
                    dictio[split[0].strip()] = split[-1].strip()
        except IndexError:   # empty line
            pass
    return dictio
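
# Hand-traced sanity check for entry_to_dict() (illustrative, tab-separated entry text):
#   >>> entry_to_dict('year\t2020\ntitle\tSome title')
#   {'year': '2020', 'title': 'Some title'}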
def read_addresses(file_name):
    with open(file_name, 'r') as file:
        content = file.read()
    all_entries = content.split('enditem')

    rows = []
    for entry in all_entries:
        dictio = entry_to_dict(entry)
        if dictio != {}:
            rows.append(dictio)
    df = pd.DataFrame(rows)    # DataFrame.append was removed in pandas 2.0
    df = df.fillna(np.NaN)
    # drop entries where both first name (vorname) and last name (nachname) are missing
    ind = df[df['vorname'].isna() & df['nachname'].isna()].index.values
    df.drop(ind, inplace=True)
    return df
    
def write_addresses(file_name, df):
    separator = 'enditem\n#################################################\n'
    with open(file_name, 'w') as file:
        file.write(separator.split('\n', maxsplit=1)[1])
        for index, row in df.iterrows():
            # write only the non-NaN elements:
            to_write = row[~row.isna()]
            for index_w, value in to_write.items():
                if len(index_w) > 7:
                    file.write(index_w + '\t' + str(value) + '\n')
                else:
                    file.write(index_w + '\t\t' + str(value) + '\n')
            file.write(separator)
    return
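
# Illustrative round trip (the file names are placeholders):
#   df = read_addresses('database.addresses')
#   write_addresses('database.addresses.new', df)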

def printmd(string, color=None):
    # formatted printing in a Jupyter notebook;
    # Markdown commands work (bold: '**', italics: '*', ...)
    colorstr = '<span style=color:{}>{}</span>'.format(color, string)
    display(Markdown(colorstr))

    
def correct_authors_and_doi(df):
    # correct all authors if DOI can be found on Scopus or Crossref
    N_new_dois = 0           # counter for new DOIs
    N_corrected_dois = 0     # counter for corrected DOIs
    N_authorlist = 0
    authors_new = []         # list with formatted authors
    for ind in df.index:
        DOI = df.loc[ind].DOI
        if not pd.isna(DOI):
            DOI = str(DOI).strip(' ')    # remove spaces at beginning and end
        if pd.isna(DOI) or DOI == '':    # if DOI not given --> find_doi
            DOI = find_doi_for_title(df.loc[ind].title)
            df.loc[ind,'DOI'] = DOI
            if not pd.isna(DOI):
                print('Found the DOI(' + str(DOI) + ') for:', df.loc[ind].title)
                N_new_dois += 1
        if pd.isna(DOI):       # still no DOI found: cannot correct the authors
            authors_new.append(np.NaN)
            if df.loc[ind].type != 'patent' and df.loc[ind].type != 'thesis':    # patent/thesis no DOI
                print(f'No authorlist found for {df.loc[ind].title} ({df.loc[ind].type})')    
        else:                                        # if we have a DOI --> correct the author string:
            author_ = get_authors_doi(DOI)
            if author_ is not None:                  # successfully created the author string
                authors_new.append(author_)
                N_authorlist += 1
            else:                                    # the DOI is incorrect
                new_doi = find_doi_for_title(df.loc[ind].title)
                if pd.isna(new_doi) or new_doi == DOI:
                    authors_new.append(np.NaN)
                else:                                # found a new DOI: the old one needs to be corrected!
                    print(f'Found the correct DOI:{new_doi} for the old one:{DOI}')
                    N_corrected_dois += 1
                    df.loc[ind,'DOI'] = new_doi      # overwrite the old one
                    author_ = get_authors_doi(new_doi)
                    if author_ is not None:
                        authors_new.append(author_)
                        N_authorlist += 1
                    else:                            # found the correct DOI, but could not create the author string
                        authors_new.append(np.NaN)
    printmd(f'**{N_new_dois} new DOIs have been found and {N_corrected_dois} incorrect DOIs have been corrected**.', color='red')
    print('We also created {} formatted author list entries'.format(N_authorlist))
    df['authors_new'] = authors_new
    return df
    
     
def find_doi_for_title(title, author_preset=None):
    """Looks for the DOI in Scopus and Crossref.
    """
    #####################
    # SCOPUS:
    try:
        document_search = ScopusSearch(f'title({title})', verbose=True)
        if len(document_search.results) == 1:      # exactly one match found, good!
            doi_scopus = document_search.results[0].doi
            return doi_scopus
    except Exception:  # nothing found in Scopus
        pass
    #####################
    #####################
    # CROSSREF:
    item = find_crossref_for_title(title, author_search=author_preset, journal=None)
    if item != None:     # we found the DOI in crossref
        return item['DOI']
    else:                  # neither in Scopus nor Crossref found DOI
        return np.NaN
        

def find_crossref_for_title(title, author_search, journal):
    # searches for a match for 'title' in Crossref
    # it compares the similarity of the given title and the title found by Crossref
    # if the similarity is bigger than 0.8 (1.0 means identical), we found the article
    # RETURN: item (item['DOI'], for example, is the DOI)
    works = Works()         # from crossref.restful
    if journal is None:     # Crossref query without journal
        if author_search is None:
            w1 = works.query(bibliographic=title)
        else:
            w1 = works.query(bibliographic=title, author=author_search)
    else:
        if author_search is None:
            w1 = works.query(bibliographic=title, container_title=journal)
        else:
            w1 = works.query(bibliographic=title, author=author_search, container_title=journal)
    for item in w1:         # loop over the Crossref results
        # calculate the similarity between 'title'
        # and the results found by Crossref
        try:
            sim_new = SequenceMatcher(None, item['title'][0], title).ratio()
        except Exception:
            sim_new = 0.0
        if sim_new > 0.8:   # article title found
            return item
    return None
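
# Illustrative usage (requires network access; the title is a placeholder):
#   item = find_crossref_for_title('Some article title', author_search='Cuniberti', journal=None)
#   doi = item['DOI'] if item is not None else np.NaN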


def get_authors_doi(DOI):
    # formats the author string for a publication when the DOI is given
    # wanted format: 'E. Krok, S. Balakin, J. Jung, F. Gross, J. Opitz, and <b>G. Cuniberti</b>'
    try:
        return format_authors_scopus(DOI)
    except Exception:
        try:
            return format_authors_crossref(DOI)
        except Exception:
            return None
        

def format_authors_crossref(DOI, item=None):
    """
    For a given DOI the authors are looked up in Crossref and
    formatted in the following way:
    'E. Krok, S. Balakin, J. Jung, F. Gross, J. Opitz, and <b>G. Cuniberti</b>'
    """
    if item is None:
        works = Works()
        item = works.doi(DOI)  # search Crossref for the given DOI
    string = ''
    author_list = [x for x in item['author'] if 'family' in x]
    for i, person in enumerate(author_list):
        # compare against the filtered list: authors without a 'family' entry
        # are dropped, so len(item['author']) would miss the last author
        if i == (len(author_list) - 1):                     # last author is preceded by 'and'
            string += 'and '
        if person['family'].capitalize() == 'Cuniberti':    # Cuniberti in bold font (currently disabled)
            string += ''  # '<b>'
        for first in person['given'].split():               # more than one given name
            if first[0] != '-' and first[0] != '–':
                if len(first.split('-')) > 1:               # hyphenated given name, e.g. 'Jean-Luc' -> 'J.-L.'
                    for sp in first.split('-'):
                        string += sp[0].upper() + '.-'
                    string = string[:-1] + ' '
                else:
                    string += first[0].upper() + '. '
            else:
                string += '-' + first[1].upper() + '. '
        if person['family'].capitalize() == 'Cuniberti':
            string = string + 'Cuniberti, '  # 'Cuniberti</b>, '
        else:
            if person['family'].isupper():
                string += person['family'].capitalize() + ', '
            else:
                string += person['family'] + ', '
    string = string[:-2]  # removes the last comma
    return string
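
# Hand-traced sanity check for format_authors_crossref() with a minimal,
# hand-built Crossref-like item (no network access needed; names are illustrative):
#   >>> format_authors_crossref(None, item={'author': [
#   ...     {'given': 'Emilia', 'family': 'Krok'},
#   ...     {'given': 'Gianaurelio', 'family': 'Cuniberti'}]})
#   'E. Krok, and G. Cuniberti'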
 
    
def format_authors_scopus(DOI, article=None):
    """
    For a given DOI the authors are looked up in Scopus and formatted in the following way:
    'E. Krok, S. Balakin, J. Jung, F. Gross, J. Opitz, and <b>G. Cuniberti</b>'
    """
    if article is None:
        article = AbstractRetrieval(DOI)
    string = ''
    for i, person in enumerate(article.authors):
        if i == (len(article.authors) - 1):
            string += 'and '
        if person.surname.capitalize() == 'Cuniberti':
            string +=  '' #'<b>'
        if person.given_name is not None and person.given_name != 'None':
            for first in person.given_name.split():
                if first[0] != '-' and first[0] != '–':
                    if len(first.split('-')) > 1:
                        for sp in first.split('-'):
                            string += html_coding(sp[0].upper()) + '.-'
                        string = string[:-1]  + ' '
                    else:
                        string += html_coding(first[0].upper()) + '. '
                else:
                    string += '-' + html_coding(first[1].upper()) + '. '
        if person.surname.capitalize() == 'Cuniberti':
            string = string + 'Cuniberti, '  #'Cuniberti</b>, '
        else:
            if person.surname.isupper():
                string += html_coding(person.surname.capitalize()) + ', '
            else:
                string += html_coding(person.surname) + ', '
    string=string[:-2] # removes the last comma
    return string


def html_coding(string):
    """Converts string to HTML entities.
    """
    string = ne.named_entities(html.escape(string))
    return string
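
# Illustrative behavior of html_coding(), assuming the namedentities package maps
# non-ASCII characters to named HTML entities:
#   >>> html_coding('Jörg')
#   'J&ouml;rg'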


def write_corrected_db(df, filename):
    """
    Writes the DataFrame df in the structure needed by the webpage so that it can be imported.
    The database is saved in the file 'filename'.
    """
    separator = 'enditem\n#################################################\n'
    sep = {'AG':'\t\t' , 'cvselected':'\t' , 'type':'\t\t' ,
           'firstlast':'\t', 'cover':'\t\t', 'year':'\t\t', 'title':'\t\t',
           'authors':'\t\t', 'pubdate':'\t\t', 'reference':'\t', 'DOI':'\t\t',
           'abstract':'\t', 'filename':'\t', 'nonmiei':'\t\t', 'ECEMP':'\t\t',
           'Cover':'\t\t', 'publisher':'\t', 'submisdate':'\t', 'ISBN':'\t\t',
           'bbllx':'\t\t', 'bblly':'\t\t', 'bburx':'\t\t', 'bbury':'\t\t',
           'field':'\t\t', 'hyperref':'\t', 'journalfullname':' ','journalref':'\t',
           'arxiv':'\t\t', 'arxivmesh':'\t', 'endpage':'\t\t', 'refreport_no':'\t',
           'report_no':'\t', 'startpage':'\t', 'volume':'\t\t', 'journalacronym':'\t',
           'WCU':'\t\t', 'ISSN':'\t\t', 'issue':'\t\t', 'page':'\t\t', 'IF':'\t\t',
           'acceptdate':'\t', 'revisedate':'\t', 'webpubdate':'\t', 'ESF':'\t\t',
           'ZIH':'\t\t', 'pudate':'\t\t', 'booktitle':'\t', 'editors':'\t\t',
           'order_it':'\t', 'pubtown':'\t\t', 'reference_more':'\t', 'pISSN':'\t\t',
           'address':'\t\t', 'WCUTOPTEN':'\t', 'stampamelo':'\t', 'vjntref':'\t\t',
           'openaccess':'\t', 'AG2':'\t\t', 'journalcover':'\t', 'arxivdate':'\t',
           'commentcv':'\t', 'addmaterial':' ', 'receivdate': '\t', 'IFyear':'\t\t',
           'PRaccessioncode':' ', 'PRstatusURL':' ', 'IFref':'\t\t', 'place':'\t\t',
           'misure':'\t\t', 'librarycongress':' ', 'series':'\t\t', 'coverdetails':'\t',
           'referenceshort':'\t', 'IEEEcn':'\t\t', 'INSPECan':'\t', 'econophysics':'\t',
           'other_order_it':'\t', 'senzaISBNlink':'\t'}
#     , 'arxivpasswd':' '
    with open(filename, 'w') as file:
        for ind in df.index:
            for key, spacing in sep.items():
                if key not in df.columns:
                    continue
                value = df.loc[ind][key]
                if pd.isna(value) or value == 'None' or str(value).strip() == '':
                    continue   # skip empty values
                if key == 'authors' and (not pd.isna(df.loc[ind]['authors_new'])):
                    file.write(key + spacing + str(df.loc[ind]['authors_new']) + '\n')
                else:
                    file.write(key + spacing + str(value) + '\n')
            file.write(separator)
    print('Database written to file ', filename)
    return


def get_pub_google(searchfield, use_proxy=False):
    """
    Receives all publications from Google Scholar for a given searchfield
    
    Returns
    -------
        DataFrame
            DataFrame with all publications on Google Scholar with short info (title, year, bib)
    """
    global scholarly
    if use_proxy:                        # use a proxy if blocked by Google
        use_proxy_scholarly()
    print('If nothing happens in the next minute, your IP might already be blocked by Google')
    print('Try: proxy, VPN, TOR, ...')
    search_query = scholarly.search_author(searchfield)       # retrieve the author's data
    # NOTE: the two lines below use the old scholarly API; newer versions use
    # scholarly.fill(author, sections=['publications']) as in add_dois_scholar_info()
    author = next(search_query)
    author = author.fill(sections=['publications'])           # fill in the publication info
    # DataFrame.append was removed in pandas 2.0, so collect the bib dicts first
    pub_google = pd.DataFrame([pub.bib for pub in author.publications])
    pub_google['google_pub'] = author.publications
    pub_google.sort_values('year', ascending=False, inplace=True)
    pub_google = pub_google[~pub_google['title'].isnull()]    # delete all entries with no title
    pub_google['title'] = pub_google['title'].astype('str')
    return pub_google


def use_proxy_scholarly():
    global scholarly
    print("Let's try to use a proxy...")
    proxy = FreeProxy(rand=True, timeout=1, country_id=['DE', 'US', 'CA']).get()
    scholarly.use_proxy(http=proxy, https=proxy)
    printmd(f'**proxy is used to access Google with:** {str(proxy)}')
    return None


def google_details_publication(DF):
    # fill in details from Google (the DOI is still missing at this point)
    printmd('Fill all information available at **Google** for the publications:', color="blue")
    rows = []
    for i in DF.index:
        print(DF['title'].loc[i])
        rows.append(DF['google_pub'][i].fill().bib)   # old scholarly API, cf. get_pub_google()
        time.sleep(2)   # so as not to get blocked by Google
    pub_filled = pd.DataFrame(rows)   # DataFrame.append was removed in pandas 2.0
    pub_filled['year'] = pub_filled['year'].astype('int')
    printmd('**Finished**', color='blue')
    return pub_filled


def get_details_crossref_scopus(pub_filled, author = 'Cuniberti'):
    """
    Function that gets DOI and title from Crossref and/or Scopus and writes it into the dataframe.
    If volume or issue is not given by Google it will search for it at Crossref and Scopus.
    """
    (doi_list, title_list, item_list, pub_date, journal_short,
     abstract_list, cover_date, first_last) = [], [], [], [], [], [], [], []
    # create conditions for missing values:
    volume_condition = pub_filled['volume'].isnull()
    number_condition = pub_filled['number'].isnull()
    pages_condition = pub_filled['pages'].isnull()
    
    print('searching in Crossref and Scopus for:')
    for element, row in pub_filled.iterrows():
        print(row['title'])
        # search at Crossref:
        item_CR = find_crossref_for_title(row['title'], author, row['journal'])
        if item_CR is not None:          # if something was found on Crossref, look it up in Scopus via the DOI
            try:
                item_scopus = AbstractRetrieval(item_CR['DOI'], view='FULL')
                try:
                    cover_date.append(item_scopus.coverDate)
                except:
                    cover_date.append(np.NaN)
            except:
                printmd(item_CR['DOI'] + ' **not found in Scopus**. API key and permission correct?')
                item_scopus = None
                cover_date.append(np.NaN)
            doi_list.append(item_CR['DOI'])
            title_list.append(item_CR['title'][0])
            item_list.append(item_CR)
            pub_date.append(pub_date_cross(item_CR))
            first_last.append( first_last_author(item_CR) )
            try:
                if item_scopus.sourcetitle_abbreviation != None:
                    journal_short.append(item_scopus.sourcetitle_abbreviation)
                else:
                    journal_short.append(item_CR['short-container-title'][0])      
            except:
                try:
                    journal_short.append(item_CR['short-container-title'][0])
                except:
                    journal_short.append(np.NaN)
            if pages_condition[element]:
                try:
                    if item_scopus.startingPage != None:
                        pub_filled.loc[element,'pages'] = str(item_scopus.startingPage)
                    else:
                        pub_filled.loc[element,'pages'] = item_CR['page'].split('-')[0]
                except:
                    try:
                        pub_filled.loc[element,'pages'] = item_CR['page'].split('-')[0]
                    except:
                        pub_filled.loc[element,'pages'] = np.NaN      
            if volume_condition[element]:
                try:
                    pub_filled.loc[element,'volume'] = str(item_CR['volume'])
                except:
                    try:
                        if item_scopus.volume == None:
                            pub_filled.loc[element,'volume'] = np.NaN
                        else:
                            pub_filled.loc[element,'volume'] = str(item_scopus.volume)
                    except:    
                        pub_filled.loc[element,'volume'] = np.NaN
            if number_condition[element]:
                try:
                    pub_filled.loc[element,'number'] = item_CR['issue']
                except:
                    try:
                        if item_scopus.issueIdentifier == None:
                            pub_filled.loc[element,'number'] = np.NaN
                        else:
                            pub_filled.loc[element,'number'] = str(item_scopus.issueIdentifier)
                    except:
                        pub_filled.loc[element,'number'] = np.NaN
            if '…' in row['abstract'][20:] or 'Full-Text' in row['abstract'][20:]:
                abstract = get_abstract(item_CR, item_scopus)
                if abstract == None:
                    abstract_list.append(row['abstract'])
                elif len(abstract) > 220:
                    abstract_list.append(abstract)
                else:
                    abstract_list.append(row['abstract'])
            else:
                abstract_list.append(row['abstract'])
        else:
            doi_list.append(None)
            title_list.append(row['title'])
            item_list.append(None)
            journal_short.append(row['journal'])
            pub_date.append(None)
            abstract_list.append(row['abstract'])
            cover_date.append(np.NaN)
            first_last.append(np.NaN)
    pub_filled['DOI'] = np.array(doi_list)
    pub_filled['new_title'] = np.array(title_list)
    pub_filled['item'] = np.array(item_list)
    pub_filled['pub_date'] = np.array(pub_date)
    pub_filled['journal_short'] = np.array(journal_short)
    pub_filled['abstract'] = np.array(abstract_list)
    pub_filled['cover_date'] = np.array(cover_date)
    pub_filled['DOI'] = pub_filled['DOI'].astype(str)
    pub_filled['first_last'] = np.array(first_last)
    pub_filled = clean_abstracts(pub_filled)      # further clean the abstracts (copyright symbols,...)
    return pub_filled


def get_abstract(item_CR, item_scopus):
    """
    Returns the abstract of a publication, given the items found on Crossref and Scopus.
    The Crossref abstract is preferred; if it is missing or suspiciously short
    (fewer than 220 characters), the Scopus description or abstract is used instead.

    Returns
    -------
        String
            abstract
    """
    abstract = None
    try:
        abstract = item_CR['abstract'].replace('<jats:p> ', '')
        abstract = abstract.replace('<jats:p>', '')
        abstract = abstract.replace(' </jats:p>', '')
        abstract = abstract.replace('</jats:p>', '')
        if len(abstract) < 220:
            if item_scopus is None:
                return abstract
            if item_scopus.description is None:
                if item_scopus.abstract is None:
                    return abstract
                abstract = item_scopus.abstract
            else:
                abstract = item_scopus.description
        return abstract
    except Exception:   # no Crossref abstract available (or item_CR is None)
        if item_scopus is None:
            return abstract
        if item_scopus.description is None:
            if item_scopus.abstract is None:
                return abstract
            abstract = item_scopus.abstract
        else:
            abstract = item_scopus.description
    return abstract

def clean_abstracts(df):
    # removes leading/trailing copyright notices from all abstracts in df
    # (same heuristic as clean_abstract() above, applied to a DataFrame)
    for num in df.index:
        string = ''
        txt = df.loc[num].abstract
        if txt[0] == '©' or 'Copyright' in txt[0:15] or 'c○' in txt[:5] or 'COPYRIGHT' in txt[0:15] or 'This journal is' in txt[:30]:
            if txt.find('Weinheim')!= -1:
                string = string + txt[txt.find('Weinheim')+8:] +'\n'
            elif txt.find('Switzerland') !=-1:
                string = string + txt[txt.find('Switzerland')+11:]
            else:
                string = string + txt[txt.find('.')+1:]     
        else:
            if '©' in txt[-100:]:
                string = string + txt[:txt.find('©')]
            else:
                string = string + txt 
        df.loc[num,'abstract'] = string
    return df


def add_pubs_to_file(publications, filename='add.database.papers'):
    """
    Writes publications from Google to 'filename' with the structure needed by the webpage of the chair.
    It also returns the URLs of the articles with incomplete abstracts.
    'XXX' is written to the database wherever no data can be found; by searching for 'XXX' later,
    those entries can be completed by hand.
    """
    url_abstract_incomplete = []
    publications = publications.replace(np.nan, 'XXX', regex=True)
    separator = 'enditem\n#################################################\n'   # between each publication
    name_row = ['AG\t\t', 'cvselected\t', 'type\t\t', 'firstlast\t', '#filename\t',
                '#cover\t\t', 'year\t\t', 'title\t\t', 'authors\t\t', 'pubdate\t\t',
                'reference\t', 'DOI\t\t', 'hyperref\t', 'abstract\t', 'enditem']
    
    sep = {'AG':'\t\t', 'cvselected':'\t', 'type':'\t\t' ,
           'firstlast':'\t', 'cover':'\t\t', 'year':'\t\t', 'title':'\t\t',
           'authors':'\t\t', 'pubdate':'\t\t', 'reference':'\t', 'DOI':'\t\t', 
           'abstract':'\t', 'filename':'\t', 'nonmiei':'\t\t', 'ECEMP':'\t\t',
           'Cover':'\t\t', 'publisher':'\t', 'submisdate': '\t', 'ISBN':'\t\t',
           'bbllx':'\t\t', 'bblly':'\t\t', 'bburx':'\t\t', 'bbury':'\t\t',
           'field':'\t\t', 'hyperref':'\t', 'journalfullname':' ', 'journalref': '\t',
           'arxiv':'\t\t', 'arxivmesh':'\t', 'arxivpasswd':' ', 'endpage':'\t\t',
           'refreport_no':'\t', 'report_no':'\t', 'startpage':'\t', 'volume':'\t\t',
           'journalacronym':'\t', 'WCU':'\t\t', 'ISSN':'\t\t', 'issue':'\t\t',
           'page':'\t\t', 'IF':'\t\t', 'acceptdate':'\t', 'revisedate':'\t',
           'webpubdate':'\t', 'ESF':'\t\t', 'ZIH':'\t\t', 'pudate':'\t\t',
           'booktitle':'\t', 'editors':'\t\t', 'order_it':'\t ', 'pubtown':'\t\t',
           'reference_more':'\t', 'pISSN':'\t\t', 'address':'\t\t', 'WCUTOPTEN':'\t',
           'stampamelo':'\t', 'vjntref':'\t\t', 'openaccess':'\t', 'AG2':'\t\t',
           'journalcover':'\t', 'arxivdate':'\t', 'commentcv':'\t', 'addmaterial':' ',
           'receivdate':'\t', 'IFyear':'\t\t', 'PRaccessioncode':' ', 'PRstatusURL':' ',
           'IFref':'\t\t', 'place':'\t\t', 'misure':'\t\t', 'librarycongress':' ',
           'series':'\t\t', 'coverdetails':'\t', 'referenceshort':'\t', 'IEEEcn':'\t\t',
           'INSPECan':'\t', 'econophysics':'\t', 'other_order_it':'\t', 'senzaISBNlink':'\t'}
    with open(filename, 'w') as file:
        for index, row in publications.iterrows():
            try:     # in case the 'authors_new' column exists and is not NaN
                if not pd.isna(row['authors_new']) and row['authors_new'] != 'XXX':
                    author_short = row['authors_new']
                else:
                    raise ValueError()
            except:  # if 'authors_new' does not contain the formatted authors,
                     # create them from either the Crossref item or Google
                try:
                    author_short = format_authors_crossref(row['DOI'], item=row['item'])
                except:
                    author_short = format_authors_google(row['author'])
            reference = format_reference(row)      # format the short reference
            new_title = html_coding(row['new_title'])
            abstract = html_coding(row['abstract'])
            if '&hellip;' in abstract[-15:]:
                url_abstract_incomplete.append(row['url'])
            file.write(separator)
            content = ['GC', 'no', 'typeXXX', row['first_last'], '',
                       '../pubs/reprints/covers/XXX', str(row['year']),
                       new_title, author_short, row['pub_date'],
                       reference, row['DOI'], row['url'], abstract, '']
            for name, value in zip(name_row, content):
                file.write(name + value + '\n')
    return url_abstract_incomplete


def format_reference(row):
    # builds the short reference string: journal, volume, page (year)
    string = ''
    string += '<i>' + html_coding(row['journal_short']) + '</i>'
    try:
        string += ' <b>' + str(int(row['volume'])) + '</b> '
    except:
        string += ' <b>' + str(row['volume']) + '</b> '
    try:
        string += str(int(row['pages'])) + ' (' + str(int(row['year'])) + ')'
    except:
        string += str(row['number']) + ' (' + str(int(row['year'])) + ')'
    return string
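
# Hand-traced sanity check for format_reference() (illustrative row values):
#   >>> format_reference({'journal_short': 'Nanotechnology', 'volume': 31,
#   ...                   'pages': 205603, 'number': 20, 'year': 2020})
#   '<i>Nanotechnology</i> <b>31</b> 205603 (2020)'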


def first_last_author(item):
    """ Checks if Cuniberti is either first or last author
    """
    if (item['author'][0]['family']) == 'Cuniberti' or (item['author'][-1]['family']) == 'Cuniberti':
        return 'yes'
    else:
        return 'no'
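
# Hand-traced sanity check for first_last_author() (minimal Crossref-like item):
#   >>> first_last_author({'author': [{'family': 'Krok'}, {'family': 'Cuniberti'}]})
#   'yes'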


def open_urls(urls):
    for i, url in enumerate(urls):
        if i == 0:
            webbrowser.open(url)
        else:
            webbrowser.open_new_tab(url)
    return True


def pub_date_cross(item):
    # returns the publication date from a Crossref item as 'YYYY.DD.MM'
    try:
        year, month, day = item['published-online']['date-parts'][0]
    except Exception:
        try:
            year, month, day = item['issued']['date-parts'][0]
        except Exception:
            return np.NaN
    if len(str(month)) == 1:
        month = '0' + str(month)
    if len(str(day)) == 1:
        day = '0' + str(day)
    return '{0}.{1}.{2}'.format(year, day, month)
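
# Hand-traced sanity check for pub_date_cross() (minimal Crossref-like item);
# note the 'YYYY.DD.MM' (year.day.month) order expected by the database:
#   >>> pub_date_cross({'published-online': {'date-parts': [[2020, 3, 2]]}})
#   '2020.02.03'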

    
def format_authors_google(aut, html_encode=False):
    if not html_encode:
        def html_coding(value):   # shadow the module-level html_coding with a no-op
            return value
    # first names may be separated by '-'
    # TODO: two last names are not handled
    string = ''
    author_list = aut.split(' and ')
    for i, name in enumerate(author_list):
        surname = name.split(' ')[-1]
        if i == (len(author_list) - 1):
            string = string + 'and '    # no leading space: the previous author already ends with ', '
        if surname == 'Cuniberti':
            string = string + '<b>'
        first_names = name.split(' ')[:-1]
        first_short = ''
        for first in first_names:
            if first[0] != '-' and first[0] != '–':
                first_short += html_coding(first[0]) + '. '
            else:
                first_short += '-' + html_coding(first[1]) + '. '
        # this used to be the 'else' of the for-loop above, which always ran
        # (the loop has no break), so it is plain code after the loop:
        string = string + first_short + html_coding(surname)
        if surname == 'Cuniberti':
            string = string  # + '</b>'
        string = string + ', '
    string = string[:-2]    # removes the last comma
    return string
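
# Hand-traced sanity check for format_authors_google() (illustrative author string):
#   >>> format_authors_google('Emilia Krok and Gianaurelio Cuniberti')
#   'E. Krok, and <b>G. Cuniberti'
# (the closing </b> is deliberately commented out in the function)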


def write_corrected_database_short(df):
    # writes only publications of type 'regular' or 'letter' to 'database_new'
    separator = 'enditem\n#################################################\n'
    content = ['type', 'firstlast', 'year', 'title', 'authors', 'DOI', 'reference', 'abstract']
    dist = ['\t\t', '\t', '\t\t', '\t\t', '\t\t', '\t\t\t', '\t', ' ']
    with open('database_new', 'w') as file:
        for ind in df.index:
            if df.loc[ind]['type'] not in ('regular', 'letter'):
                continue
            for i, cont in enumerate(content):
                value = df.loc[ind][cont]
                if pd.isna(value) or value == 'None':
                    continue
                file.write(cont + dist[i] + str(value) + '\n')
            file.write(separator)
    return
