Cleaning up a .bib file

I would recommend bibexport script. It creates a new .bib file that includes only the references you cite in the .tex file, and cleans them up. I use it for submissions to journals, when I want to send only the relevant references rather than my databases.

Also JabRef has some duplicate search and resolve capabilities which can be accessed via the menu items shown (for v2.8) below:

enter image description here

This is a python code that automatically deal with some of these issues. Please feel free to give feedback or suggestions to enhance or extend the features of this code. Hope it can help.

# perform some tests on ped.bib related to the pdf-directory (pdfdir)
# activate one or more tests by setting those control variables to 1

# 1. is_check_file: check objects with no or empty file entry
# 2. is_check_double_file: check if two or more different objects have the same pdf entry
# 3. is_check_unused_files: check <pdfdir> for unsused files

import os, sys, glob, copy
from os import path, access, R_OK  # W_OK for write permission.
from operator import itemgetter

#-------- control parameter ---------
is_check_file = 0
is_check_double_file = 0
is_check_unused_files = 1
#------------------------------------   

ROOT = os.getenv("HOME") # Home directory
#ROOT = path.expanduser("~") # works on all platforms  

pdfdir = ROOT + '/lit/pdf/'
print 'pdfdir', pdfdir
debug = 0 # shutdown debug/info messages

bib_data = open('ped.bib')

words = ['author', 'title', 'journal', 'year', 'volume', 'comment', 'issue', 'owner', 'file', 'timestamp', 'booktitle', 'editor', 'publisher', 'number', 'part', 'keywords', 'doi', 'month', 'organization', 'url']
#------------------------------------------------------------------------------

def check_missing_pdf(element):
    """
    following testes:
    1. element has no file entry
    2. element has empty file entry
    3. element has file entry with inexistant pdf-file
    """
    #check missing files
    if not element.has_key('file'): # element has no file
        print >>sys.stderr, '==== %s has no file entry'%element.get('key')
    elif not element.get('file'):  # .. or file is empty
            print >>sys.stderr, '**** %s has empty file entry'%element.get('key')
    else:
        # we have a file entry -----> check its existance in the pdf dir
        pdffile = pdfdir + element.get('file')
        if not( path.exists(pdffile) and path.isfile(pdffile) and access(pdffile, R_OK)):
            print >>sys.stderr, '#### %s with file entry [%s]: Either file is missing or is not readable'%(element.get('key'), element.get('file'))

#------------------------------------------------------------------------------

def check_doubles(elements):
    """
    check if two or more different objects have the same pdf-file 
    """
    doubles = {}
    for element in elements:
        pdffile = element.get("file")
        key = element.get("key")
        if not doubles.has_key(pdffile):
            doubles[pdffile] = [key]
        else:
            doubles[pdffile].append(key)
    for f, k in doubles.iteritems():
        if f and len(k) > 1: # if f excludes case f == None
            print >>sys.stderr, "Keys:", k, "have the same file <%s>"%f
#------------------------------------------------------------------------------

def check_unused_files(elements):
    """
    check dir pdfdir ("lit/pdf") for pdf-files that are not used
    in the ped.bib
    """
    pdf_files = glob.glob( pdfdir + "*.pdf") # pdfs in lit/pdf/
    dummy_files = copy.copy(pdf_files) # list of unused files
    for pdf in pdf_files:
        for element in elements:
            element_pdf =  element.get("file")
            if element_pdf is None:
                continue
            element_pdf = path.basename(element_pdf)

            if path.basename(pdf) == element_pdf:
                dummy_files.remove(pdf)
                break # check next files

    if dummy_files:
        print >>sys.stderr, "%d files are not used:"%len(dummy_files)
        for f in dummy_files:
            print >>sys.stderr, "---->",path.basename(f)
#------------------------------------------------------------------------------

def putWord(string, dic, line):
    """
    extract from <line> the value of the key <string> and put it in  <dic>
    """
    tmp = line[1].strip(' { } , .').split(':')
    # some files are like this :llll:aaaa. So tmp[0] is here == ''
    if not tmp[0]: 
        dic[string] = tmp[1]
    else:
        dic[string] = tmp[0]
#------------------------------------------------------------------------------

def getElement(f):
    """
    get ONE element from file f.
    return dict
    """
    dic = {}
    for line in f:
        line = line.strip(' \n\r')
        if not line:
            continue
        #get <key> and <type>
        if line[0] == '@':
            sline = line.split('{')
            typ = sline[0][1:]
            if typ == 'comment': # ignore jabref-meta
                continue
            dic['type'] = typ.strip(',') 
            key = sline[1].strip(',')
            if debug:
                print >> sys.stderr, '--------> type: <%s>'%typ
                print >> sys.stderr, '--------> key: <%s>'%key
            dic['key'] = key

        line = line.split('=')

        for word in words:
            if line[0].strip(' ') == word:
                putWord(word, dic, line)
                if debug:
                    print  >> sys.stderr, '--------> %s: <%s>'%(word, line[1].strip(' { },.') ) 

        # check for last line of element
        if line[0] == '}':
            if debug:
                print >> sys.stderr, '---------------------------------'
            return dic
#------------------------------------------------------------------------------


#----------------------- get content of file in elements ------------------------------
elements = []
while True:
    dic = getElement(bib_data) 
    if not dic:
        sorted(elements, key=itemgetter('key')) 
        break
    elements.append( dic )
#------------------------------------------------------------------------------

if is_check_file:
    print "check missing files ..."
    for element in elements:
        check_missing_pdf(element)

if is_check_double_file:
    print "check double files ..."
    check_doubles(elements)

if is_check_unused_files:
    print "is_check_unused_files ..."
    check_unused_files(elements)

You can use checkcites to get a list of all unused references via your .aux file. To do this simply open your terminal and type:

checkcites --unused document.aux

checkcites

Cleaning up a .bib file

Tags:

Bibtex

Related

Recent Posts