#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Create BibTeX entry for IETF documents (RFCs and Internet Drafts)
# Copyright (C) 2014-2022 by Thomas Dreibholz
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Contact: thomas.dreibholz@gmail.com
#

from xml.dom.minidom import parse
import codecs
import csv
import hashlib
import os
import re
import sys
import urllib.request


# ====== Remote locations of the IETF meta-information ======================
# Base URL of the datatracker; document name and '/99/bibtex/' get appended.
datatrackerPrefix = 'https://datatracker.ietf.org/doc/'
# Base URL of the reference XML of Internet Drafts (name + '.xml' is appended).
draftPrefix = 'https://bib.ietf.org/public/rfc/bibxml3/reference.I-D.'
# Base URL of the reference XML of RFCs (4-digit number + '.xml' is appended).
rfcPrefix = 'https://bib.ietf.org/public/rfc/bibxml/reference.RFC.'

# ====== Run-time options (changed by the command-line handling) ============
# Write progress and "Fix: ..." notes to stderr when set.
verboseMode = True
# 'DEFAULT': look for authors-fix.list in the default places,
# None: do not use an authors fix file, otherwise: name of the file to use.
authorsFixFile = 'DEFAULT'


# Documents that are skipped, because their meta-information XML is wrong:
blacklistTable = [
   'draft-ahmed-lssctp',
   'draft-adams-qos-broadband',
   'draft-lopez-mptcp-middlebox',
   'draft-wei-mptcp-proxy-mechanism',
   'draft-ietf-rserpool-api', 'draft-ietf-rserpool-applic',
   'draft-ietf-rserpool-arch', 'draft-ietf-rserpool-comp',
   'draft-ietf-rserpool-mib', 'draft-ietf-rserpool-tcpmapping',
   'draft-iyengar-sctp-cacc',
   'draft-xue-mptcp-tmpp-unware-hosts',
]

# BibTeX month macros, indexed by (month number - 1):
monthIDTable = 'jan feb mar apr may jun jul aug sep oct nov dec'.split()

# English month names, indexed by (month number - 1). The additional last
# entry marks "no valid month"; its number (13) is never printed.
monthNameTable = [
   'January',   'February', 'March',    'April',
   'May',       'June',     'July',     'August',
   'September', 'October',  'November', 'December',
   'INVALID',
]


# ###### Fetch URL, with caching ############################################
def fetch(url):
   """Return a binary file object with the contents of url.

   The contents are cached in /tmp/ietf2bibtex-HASH.data, where HASH is the
   SHA-256 of the URL. If that file already exists, it is used without any
   network access.

   The download goes into a temporary file first, which is moved to the cache
   file name only after the download has completed. A failed or interrupted
   download can therefore never leave a truncated cache file behind.

   The caller is responsible for closing the returned file object.
   Errors raised by urllib.request.urlretrieve() or open() are passed on.
   """
   urlHash = hashlib.sha256(url.encode('utf-8')).hexdigest()
   cacheFileName = '/tmp/ietf2bibtex-' + urlHash + '.data'

   if not os.path.exists(cacheFileName):
      if verboseMode:
         sys.stderr.write('Fetching ' + url + ' into ' + cacheFileName + ' ...\n')
      # The process ID keeps concurrent runs from writing to the same file.
      partialFileName = cacheFileName + '.' + str(os.getpid()) + '.part'
      try:
         urllib.request.urlretrieve(url, partialFileName)
         os.replace(partialFileName, cacheFileName)
      finally:
         # Only still there if the download or the move has failed.
         if os.path.exists(partialFileName):
            os.remove(partialFileName)
   else:
      if verboseMode:
         sys.stderr.write('Using cached ' + url + ' from ' + cacheFileName + ' ...\n')

   return open(cacheFileName, 'rb')


# ###### Get URL of I-D metadata ############################################
def getMetadata(document):
   """Get the reference XML URL and the datatracker author list of a document.

   If document does not end with a two-digit version ("-NN"), the BibTeX
   entry of the document is fetched from the datatracker. From its "number"
   field, the name with the latest version is taken. From its "author" field,
   the author names are taken.

   Returns the list [ url, authorList ]:
   - url is draftPrefix + document (with version) + '.xml'.
   - authorList contains one dictionary with the keys 'fullname', 'initials'
     and 'surname' per author. It is empty, if document already had a version,
     or no author field was found.
   """
   authorList = [ ]

   # ====== Find latest version =============================================
   if not re.match(r'^.*-[0-9][0-9]$', document):
      datatrackerURL = datatrackerPrefix + document + '/99/bibtex/'
      with fetch(datatrackerURL) as datatrackerFile:
         datatrackerStatus = datatrackerFile.readlines()

      numberRE = re.compile(r'^(number[ \t]*=[ \t]*\{)([^ ]*)-([0-9]+)\},$')
      authorRE = re.compile(r'^([ \t]*author[ \t]*=[ \t]*\{)(.*)(\})')
      for rawLine in datatrackerStatus:
         line = rawLine.decode('utf-8').strip()
         match = numberRE.match(line)
         if match is not None:
            name          = match.group(2)
            latestVersion = match.group(3)
            document = name + '-' + latestVersion
            sys.stderr.write('Latest version of ' + name + ' is ' + latestVersion + ' => ' + document + '\n')
            continue

         # ====== Find author list ==========================================
         match = authorRE.match(line)
         if match is None:
            continue
         for rawName in match.group(2).split(' and '):
            # "A.B. Name" -> "A. B. Name", "First M Name" -> "First M. Name",
            # then remove a leading title.
            fullname = re.sub(r'(\.)([A-Z])', r'. \2', rawName)
            fullname = re.sub(r' ([A-Z]) ', r' \1. ', fullname)
            fullname = re.sub(r'^(Professor |Dr\. )', '', fullname)

            # Consecutive blanks would produce empty parts, which have no
            # first letter to build an initial from.
            nameParts = [ part for part in fullname.split(' ') if part ]
            if not nameParts:
               continue   # Empty author entry
            authorList.append({
                'fullname': fullname,
                'initials': ''.join(part[0] + '.' for part in nameParts[:-1]),
                'surname':  nameParts[-1]
            })

   url = draftPrefix + document + '.xml'

   return [ url, authorList ]


# ###### Decode one raw line of the text document ###########################
def _decodeLine(rawLine):
   """Return rawLine (bytes) decoded and stripped, or '' if not decodable."""
   try:
      return rawLine.decode().strip()
   except UnicodeDecodeError:
      # Sometimes, there are some strange letters inside. Just ignore them.
      return ''


# ###### Get month number from month name ###################################
def _monthNumber(monthName):
   """Return the 1-based position of monthName in monthNameTable.

   An unknown name yields the position of the last entry ('INVALID'), which
   is outside of the range 1..12 that is printed as BibTeX month.
   """
   try:
      return monthNameTable.index(monthName) + 1
   except ValueError:
      return len(monthNameTable)


# ###### Prepare abstract text for BibTeX ###################################
def _cleanAbstract(abstract):
   """Return abstract as a single, whitespace-normalised BibTeX text."""
   replacementList = [
      ( '\t',                 ' '       ),
      ( '\n',                 ' '       ),
      ( ' (',                 '~('      ),
      ( ' - ',                ' -- '    ),
      ( 'i.e. ',              'i.e.\\ ' ),
      ( 'e.g. ',              'e.g.\\ ' ),
      ( ' [STANDARDS-TRACK]', ''        )
   ]
   for ( oldText, newText ) in replacementList:
      abstract = abstract.strip().replace(oldText, newText)
   abstract = re.sub(r'(RFC|STD)( )([0-9]+)', r'\1~\3', abstract.strip())
   abstract = abstract.replace('</t><t>', '\n')

   # Repeat until neither stripping nor the replacement changes the length:
   while True:
      oldLength = len(abstract)
      abstract = abstract.strip().replace('  ', ' ')
      if len(abstract) == oldLength:
         break
   return abstract


# ###### Take over author names found in a line of the text document ########
def _updateAuthorsFromLine(text, authorPatternList, fullAuthorList):
   """Replace entries of fullAuthorList by names matched in text.

   authorPatternList[i] is the compiled pattern belonging to
   fullAuthorList[i]; its group 2 is the author name. A match is only taken
   over, if it is at least as long as the name currently stored.
   """
   for i, authorPattern in enumerate(authorPatternList):
      m = authorPattern.match(text)
      if (m is not None) and (len(m.group(2)) >= len(fullAuthorList[i])):
         fullAuthorList[i] = m.group(2)


# ###### Fix author list via author fix file ################################
def _applyAuthorsFix(fullAuthorList):
   """Replace names in fullAuthorList according to the authors fix file.

   The file is selected by the global authorsFixFile: 'DEFAULT' tries the
   default locations, None disables the fix, any other value is taken as
   file name. Each row of the file contains the name to look for and its
   replacement, separated by blanks. Rows with less than two fields (e.g.
   empty lines) are skipped.

   Exits with status 1, if a fix file is requested but none can be loaded.
   """
   csv.register_dialect('authors-fix', delimiter=' ', quoting=csv.QUOTE_MINIMAL, skipinitialspace=True)
   if authorsFixFile == 'DEFAULT':
      authorsFixFileList = [ 'authors-fix.list', '/usr/share/doc/bibtexconv/examples/authors-fix.list' ]
   elif authorsFixFile is not None:
      authorsFixFileList = [ authorsFixFile ]
   else:
      authorsFixFileList = [ ]

   rowList = None
   for authorsFixFileName in authorsFixFileList:
      if os.path.exists(authorsFixFileName):
         try:
            # Read everything at once, so the file can be closed right away
            # and read errors are handled here.
            with codecs.open(authorsFixFileName, 'r', 'utf-8') as inputFile:
               rowList = list(csv.reader(inputFile, dialect='authors-fix'))
            break
         except (OSError, UnicodeError, csv.Error):
            rowList = None   # Try the next candidate

   if rowList is None:
      if authorsFixFile is not None:
         sys.stderr.write('ERROR: Unable to load authors fix file. Use option --authors-fix-file=FILE or --no-authors-fix-file!\n')
         sys.exit(1)
      return

   if verboseMode:
      sys.stderr.write('NOTE: Using authors fix file ' + authorsFixFileName + '\n')
   for row in rowList:
      if len(row) < 2:
         continue
      for i, author in enumerate(fullAuthorList):
         if row[0] == author:
            if verboseMode:
               sys.stderr.write('Fix(' + authorsFixFileName + '): ' + author + ' -> ' + row[1] + '\n')
            fullAuthorList[i] = row[1]
            break


# ###### Get BibTeX from IETF document ######################################
def getBibTeXForIETFDocument(document):
   """Write a BibTeX @TechReport entry for an IETF document to stdout.

   document is the name of an Internet Draft ("draft-...", with or without
   version) or of an RFC ("rfcNNNN", case-insensitive). Blacklisted documents
   are skipped with a note on stderr.

   The entry is combined from three sources:
   - the datatracker BibTeX entry (via getMetadata()): author names,
   - the reference XML file: authors in order, date, title, series
     information, URL and abstract,
   - the text of the document itself: RFC category, full date and the
     author names as written in the document.
   Finally, the author names are corrected by the authors fix file.

   Notes and warnings are written to stderr. On fatal errors (unknown
   document type, download or parse failure, no authors, no title, bad author
   entry, authors fix file not loadable), the program exits with status 1.
   """
   if document in blacklistTable:
      sys.stderr.write('NOTE: ' + document + ' is blacklisted due to bad metadata in XML file. Skipping.\n')
      return

   issn               = None
   language           = 'english'
   note               = None
   unsortedAuthorList = None


   # ====== Get information from BibTeX file ================================
   # The IETF-provided BibTeX file has:
   # - Information about the latest version
   # - More accurate author information, but unordered
   # - But *not* the full date
   if re.match(r'^draft-(.*)$', document) is not None:
      [ url, unsortedAuthorList ] = getMetadata(document)
      documentType   = 'Internet Draft'
      fallbackNumber = document
      if re.match(r'^draft-ietf-(.*)$', document) is not None:
         institution = 'IETF'
      else:
         institution = 'IETF, Individual Submission'
   else:
      # At least one digit is required: int() cannot convert an empty number.
      m = re.match(r'^[Rr][Ff][Cc]([0-9]+)$', document)
      if m is None:
         sys.stderr.write('ERROR: Unknown document type ' + document + '!\n')
         sys.exit(1)
      [ url, unsortedAuthorList ] = getMetadata(document)
      documentType   = 'RFC'
      number         = int(m.group(1))
      fallbackNumber = str(number)
      issn           = '2070-1721'
      institution    = 'IETF'
      document       = documentType + str(number)
      url            = rfcPrefix + '{0:04d}'.format(number) + '.xml'


   # ====== Get document information from XML file ==========================
   # The IETF-provided XML file has:
   # - Quite inaccurate author information, only ASCII names
   # - Inaccurate date, but ordered
   # - Type information
   try:
      with fetch(url) as xmlFile:
         xmlDocument = parse(xmlFile)
   except Exception as e:
      sys.stderr.write('ERROR: Failed to get XML information from ' + url + ': ' + str(e) + '!\n')
      sys.exit(1)

   # ------ Get authors list ------------------------------------------------
   # Try the front part of an actual draft XML file first:
   frontList = xmlDocument.getElementsByTagName('front')
   if len(frontList) > 0:
      authorList = frontList[0].getElementsByTagName('author')
   else:
      # Else, try to read the reference XML file:
      authorList = xmlDocument.getElementsByTagName('author')
   if len(authorList) == 0:
      sys.stderr.write('ERROR: It seems that the XML file ' + url + ' does not exist. Wrong document name?\n')
      sys.exit(1)

   # Authors without a name (e.g. organisations only) are ignored.
   namedAuthorList  = [ author for author in authorList if author.getAttribute('fullname') ]
   authorRegExpList = [ ]
   fullAuthorList   = [ ]
   for author in namedAuthorList:
      fullname = author.getAttribute('fullname')
      initials = author.getAttribute('initials')
      surname  = author.getAttribute('surname')
      if '@' in fullname:
         sys.stderr.write('ERROR: Bad author entry in ' + document + ': ' + fullname + '\n')
         sys.exit(1)
      # The names are escaped, since they are data and not patterns.
      if (fullname[1:2] == '.') and initials and surname:
         # Abbreviated first name: search for the full first name.
         authorRegExp = re.escape(initials[0]) + r'[a-zA-Z][a-zA-Z\s\.\- ]*' + re.escape(surname)
      else:
         authorRegExp = re.escape(fullname)
      authorRegExpList.append(authorRegExp)
      fullAuthorList.append(fullname)

   # ------ Get sorted author list from unsorted author list ----------------
   if unsortedAuthorList is not None:
      fullAuthorList = [ ]
      for author in namedAuthorList:
         xmlSurname = author.getAttribute('surname')
         xmlInitial = author.getAttribute('initials')[:1]
         matchingAuthor = None
         if author.hasAttribute('surname') and author.hasAttribute('initials'):
            for a in unsortedAuthorList:
               # Slices instead of indices: the initials may be empty.
               if (xmlSurname == a['surname']) and (xmlInitial == a['initials'][:1]):
                  matchingAuthor = a
                  break

         if matchingAuthor is not None:
            fullAuthorList.append(matchingAuthor['fullname'])
         else:
            # "A Name" -> "A. Name", "A.B. Name" -> "A. B. Name",
            # "First M Name" -> "First M. Name"
            fullname = re.sub(r'^([A-Z]) ', r'\1. ', author.getAttribute('fullname'))
            fullname = re.sub(r'(\.)([A-Z])', r'. \2', fullname)
            fullname = re.sub(r' ([A-Z]) ', r' \1. ', fullname)
            fullAuthorList.append(fullname)

   # ------ Get date --------------------------------------------------------
   # 0 means "unknown"; the values may be corrected from the text file below.
   month    = 0
   day      = 0
   year     = 0
   dateList = xmlDocument.getElementsByTagName('date')
   if len(dateList) > 0:
      dateElement = dateList[0]
      if dateElement.hasAttribute('month'):
         month = _monthNumber(dateElement.getAttribute('month'))
      try:
         day = int(dateElement.getAttribute('day'))
      except ValueError:
         day = 0
      yearText = dateElement.getAttribute('year')
      try:
         year = int(yearText)
      except ValueError:   # Some RFCs have some garbage before the year content.
         try:
            year = int(yearText[-4:])
         except ValueError:
            year = 0

   # ------ Get title -------------------------------------------------------
   titleList = xmlDocument.getElementsByTagName('title')
   try:
      title = titleList[0].childNodes[0].data
   except (IndexError, AttributeError):
      sys.stderr.write('ERROR: No title found in ' + url + '!\n')
      sys.exit(1)
   title = title.strip().replace(' (', '~(').replace(' - ', ' -- ')

   # ------ Get type --------------------------------------------------------
   name = None
   doi  = None
   for seriesInfo in xmlDocument.getElementsByTagName('seriesInfo'):
      seriesName  = seriesInfo.getAttribute('name')
      seriesValue = seriesInfo.getAttribute('value')
      if seriesName in [ 'RFC', 'Internet-Draft' ]:
         if seriesValue:
            name = seriesValue
      elif seriesName == 'DOI':
         if seriesValue:
            doi = seriesValue
   if name is None:
      # Without series information, the number is taken from the request.
      sys.stderr.write('WARNING: No series information for ' + document + ', using ' + fallbackNumber + '!\n')
      name = fallbackNumber

   # ------ Get URL ---------------------------------------------------------
   formatList = xmlDocument.getElementsByTagName('format')
   if (len(formatList) > 0) and formatList[0].hasAttribute('target'):
      url = formatList[0].getAttribute('target')
   elif documentType == 'RFC':
      url = 'https://tools.ietf.org/rfc/rfc' + name + '.txt'
   else:
      url = 'https://tools.ietf.org/id/' + name + '.txt'

   # Use a secure URL instead of HTTP:
   url = url.replace('http://www.rfc-editor.org/rfc/',
                     'https://www.rfc-editor.org/rfc/')
   url = url.replace('http://www.ietf.org/internet-drafts/',
                     'https://tools.ietf.org/id/')
   # Fix redirect URL:
   url = url.replace('https://www.ietf.org/internet-drafts/',
                     'https://tools.ietf.org/id/')

   # ------ Get abstract ----------------------------------------------------
   # The abstract is taken from the first "t" element of the XML file.
   abstract = None
   try:
      abstract = xmlDocument.getElementsByTagName('t')[0].childNodes[0].data
   except (IndexError, AttributeError):
      pass   # No abstract available
   if abstract is not None:
      abstract = _cleanAbstract(abstract)


   # ====== Get full author list from text file =============================
   try:
      txtFile = fetch(url)
   except Exception as e:
      sys.stderr.write('ERROR: Failed to get document from ' + url + ': ' + str(e) + '!\n')
      sys.exit(1)

   categoryRE = re.compile(r'^Category: (.*)[ ][ ].*$')
   dateRE     = re.compile(r'^.*(January|February|March|April|May|June|July|August|September|October|November|December)( [0-9]+[,]|)[ ]*([0-9][0-9][0-9][0-9])$')
   headingRE  = re.compile(r"(^([0-9\.]*|[A-Z]\.)[ \t]+.*|^)(Editors|Authors|Authors'[ \t]+Addresses|Author's[ \t]+Address|Editor's[ \t]+Address|Author[ \t]+Information).*$", re.IGNORECASE)
   # Name at the end of the right-adjusted block of the title page:
   headerAuthorREList  = [ re.compile('^(.*)(' + authorRegExp + r')([\t ]*|$)')
                              for authorRegExp in authorRegExpList ]
   # Name in the authors section, optionally followed by further text:
   sectionAuthorREList = [ re.compile('^(.*)(' + authorRegExp + r')([\t ].*|,.*|)$')
                              for authorRegExp in authorRegExpList ]

   inAuthorsSection = False
   foundDate        = False
   with txtFile:
      for lineNumber, rawLine in enumerate(txtFile, start = 1):
         line = _decodeLine(rawLine)

         # ====== Title page ================================================
         if lineNumber < 30:
            # ------ RFC category -------------------------------------------
            m = categoryRE.match(line)
            if m is not None:
               category = m.group(1).strip()
               if (documentType == 'RFC') and (category != 'Experimental'):
                  documentType = category + ' RFC'

            if not foundDate:
               m = dateRE.match(line)
               if m is not None:
                  foundDate = True

                  # ------ Get year -----------------------------------------
                  newYear = int(m.group(3))
                  if newYear != year:
                     if verboseMode:
                        sys.stderr.write('Fix: Year ' + str(year) + ' -> ' + str(newYear) + '\n')
                     year = newYear

                  # ------ Get month ----------------------------------------
                  newMonth = _monthNumber(m.group(1))
                  if newMonth != month:
                     if verboseMode:
                        sys.stderr.write('Fix: Month ' + str(month) + ' -> ' + str(newMonth) + '\n')
                     month = newMonth

                  # ------ Get day ------------------------------------------
                  m2 = re.match('.* ([0-9]+) ' + monthNameTable[month - 1] + ' ' + str(year) + '$', line)
                  if m2 is not None:   # Format: Day Month Year
                     newDay = int(m2.group(1))
                  else:                # Format: Month Day, Year
                     try:
                        newDay = int(m.group(2).replace(',', ''))
                     except ValueError:
                        newDay = 0
                  if (newDay != 0) and (newDay != day):
                     if verboseMode:
                        sys.stderr.write('Fix: Day ' + str(day) + ' -> ' + str(newDay) + '\n')
                     day = newDay

               # ------ Author ----------------------------------------------
               else:
                  # Author is in right-adjusted block
                  block = re.sub(r'.*[ \t][ \t]', '', line)
                  _updateAuthorsFromLine(block, headerAuthorREList, fullAuthorList)

         # ====== Authors section ===========================================
         if len(authorRegExpList) < 1:
            continue
         if inAuthorsSection:
            # ------ Check for end of author section ------------------------
            if re.search(r'^[0-9\.]*  ', line):
               inAuthorsSection = False
            # ------ Parse author section -----------------------------------
            elif not re.search(r'^[\s]*$', line):
               _updateAuthorsFromLine(line, sectionAuthorREList, fullAuthorList)
         elif headingRE.search(line) is not None:
            # A heading within the table of contents ends with a page number.
            if re.search(r'\. \.[ \t]*[0-9]+$', line) is None:
               inAuthorsSection = True


   # ====== Fix author list via author fix file =============================
   _applyAuthorsFix(fullAuthorList)


   # ====== Check ===========================================================
   if not foundDate:
      sys.stderr.write('WARNING: Unable to find date of document ' + document + '!\n')


   # ====== Print result ====================================================
   sys.stdout.write('@TechReport{ ' + document + ',\n')
   sys.stdout.write('\tauthor = "' + ' and '.join(fullAuthorList) + '",\n')
   sys.stdout.write('\ttitle = "{' + title + '}",\n')
   sys.stdout.write('\ttype = "' + documentType + '",\n')
   sys.stdout.write('\tnumber = "' + name + '",\n')

   if day > 0:
      sys.stdout.write('\tday = "' + str(day) + '",\n')
   if (month >= 1) and (month <= 12):
      sys.stdout.write('\tmonth = ' + monthIDTable[month - 1] + ',\n')
   if year > 0:   # 0: neither the XML file nor the text provides a year
      sys.stdout.write('\tyear = "' + str(year) + '",\n')

   sys.stdout.write('\tinstitution = "' + institution + '",\n')
   sys.stdout.write('\tlanguage = "' + language + '",\n')
   if issn is not None:
      sys.stdout.write('\tissn = "' + issn + '",\n')
   if doi is not None:
      sys.stdout.write('\tdoi = "' + doi + '",\n')
   if note is not None:
      sys.stdout.write('\tnote = "' + note + '",\n')
   if abstract is not None:
      sys.stdout.write('\tabstract = "{' + abstract + '}",\n')
   sys.stdout.write('\turl = "' + url + '"\n')

   sys.stdout.write('}\n\n')



# ###### Handle arguments ###################################################
# The arguments are processed in the given order: an option only affects the
# documents that follow it on the command line.
# The guard keeps the processing from running when the file is imported.
if __name__ == '__main__':
   if len(sys.argv) < 2:
      sys.stderr.write('Usage: ietf2bibtex [--quiet|--verbose] [--authors-fix-file=FILE|--no-authors-fix-file] document ...\n')
      sys.exit(1)

   for argument in sys.argv[1:]:
      if argument == '--quiet':
         verboseMode = False
      elif argument == '--verbose':
         verboseMode = True
      elif argument == '--no-authors-fix-file':
         authorsFixFile = None
      elif argument.startswith('--authors-fix-file='):
         # The file name is part of the same argument; nothing to skip.
         authorsFixFile = argument[len('--authors-fix-file='):]
      else:
         getBibTeXForIETFDocument(argument)
