Skip to content

Instantly share code, notes, and snippets.

@Treblesteph
Forked from blahah/cleanup_bibtex.rb
Last active December 20, 2015 03:49
Show Gist options
  • Save Treblesteph/6066389 to your computer and use it in GitHub Desktop.
Save Treblesteph/6066389 to your computer and use it in GitHub Desktop.
explicit encoding declaration avoids conflicts for non-ASCII chars
#!/usr/bin/env ruby
# encoding: utf-8
# clean up bibtex files by removing unwanted fields,
# replacing troublesome strings and removing bad
# characters
# Names of the BibTeX fields that should be dropped from every entry.
removefields = %w[
  note
  copyright
  issn
  doi
  shorttitle
  abstract
  file
  urldate
  url
  language
  keywords
]

# One pattern per unwanted field, matching the line on which the field starts.
# The pattern is anchored to the start of the line (after optional indentation)
# so that 'note' does not also hit 'footnote = ...' and so that a field name
# appearing inside another field's value does not get that line removed.
# BibTeX field names are case-insensitive and the whitespace around '=' is
# optional, hence the /i flag and the \s* on either side of the name.
removeregexes = removefields.map do |x|
  /^\s*#{Regexp.escape(x)}\s*=/i
end
# [pattern, replacement] pairs. The main loop applies them with gsub!, in the
# order listed here, to every line that is kept.
# The accent replacements are written with four backslashes in the source:
# the Ruby string literal turns them into two, and gsub's replacement syntax
# turns those two into one literal backslash, so e.g. ó ends up as {\'o}.
# (A single escaped backslash followed by ' or ` would instead be read by
# gsub as the post-match / pre-match back-reference.)
# First pair: collapse a double hyphen into a single one.
replacements = [[/--/, "-"],
# drop the space before the hyphen in a page range such as "pages = {12 -34}"
[/(pages = \{\d+) -(\d+\})/, "\\1-\\2"],
# accented and special Latin letters -> LaTeX accent commands
[/ó/, "{\\\\'o}"],
[/é/, "{\\\\'e}"],
[/è/, "{\\\\`e}"],
[/ö/, "{\\\\\"o}"],
[/ø/, "{\\\\o}"],
[/ü/, "{\\\\\"u}"],
[/ñ/, "{\\\\~n}"],
[/á/, "{\\\\'a}"],
# non-ASCII hyphen characters -> plain ASCII hyphen
# NOTE(review): the next two patterns look identical when rendered; presumably
# they are two different Unicode hyphen code points — confirm before removing
# either one as a duplicate.
[/‐/, "-"],
[/‐/, "-"],
# both forms of the Greek letter sigma -> the word "sigma"
[/σ/, "sigma"],
[/ς/, "sigma"],
# prime symbol -> math-mode prime
[/′/, "$^{\\prime}$"]]
# Patterns whose matches are deleted outright (the main loop replaces them
# with the empty string).
removals = [/©/]
# Input path, and the output path derived from it (the input name prefixed
# with "fixed"), so the two can never drift apart.
file = 'bib.bib'
fixedfile = "fixed#{file}"
puts 'scanning file'

# Counters for the summary printed at the end.
removedlines = 0
replaced = 0
removed = 0

# Both files are opened as UTF-8 explicitly: the patterns in this script are
# UTF-8 literals, so the data must not be read in whatever encoding the
# current locale happens to imply. The block forms of File.open guarantee that
# both handles are closed, even if processing a line raises.
File.open(fixedfile, 'w:utf-8') do |fixed|
  # true while we are inside a multi-line field that is being dropped
  deletenextline = false
  # debug tracing state: echo up to 20 lines once the debug entry is seen
  indebug = false
  debugcount = 0

  File.open(file, 'r:utf-8') do |source|
    source.each_line do |line|
      # sometimes we want to debug a specific entry
      indebug = true if line =~ /achberger_role_1981/
      if indebug
        puts line
        debugcount += 1
        indebug = false if debugcount >= 20
      end

      # always include the start and end lines of a bib entry; reaching one
      # also ends any field deletion that was still in progress
      if line =~ /^@/ || line =~ /^\}$/
        deletenextline = false
        fixed << line
        next
      end

      stripped = line.rstrip

      # are we in the middle of a field that needs deleting?
      if deletenextline
        # the field ends on a line that closes its braces
        deletenextline = false if stripped.end_with?('},') || stripped =~ /^\}\,/
        next
      end

      # check for bad fields; a line counts once no matter how many patterns
      # it happens to match
      if removeregexes.any? { |field| line =~ field }
        removedlines += 1
        # the value continues on following lines unless it closes right here
        deletenextline = true unless stripped.end_with?('},')
        next
      end

      # perform any replacements (gsub! returns nil when nothing changed)
      replacements.each do |pattern, replacement|
        replaced += 1 if line.gsub!(pattern, replacement)
      end

      # strip bad chars
      removals.each do |pattern|
        removed += 1 if line.gsub!(pattern, '')
      end

      # save
      puts line if indebug
      fixed << line
    end
  end
end

puts "removed #{removedlines} bad lines, replaced #{replaced} bad strings and removed #{removed} bad characters!"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment