-
-
Save Treblesteph/6066389 to your computer and use it in GitHub Desktop.
explicit encoding declaration avoids conflicts for non-ASCII chars
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# encoding: utf-8
# frozen_string_literal: true

# Clean up a BibTeX file by removing unwanted fields, replacing
# troublesome strings and removing bad characters.
#
# Usage: ruby <this script> [INPUT [OUTPUT]]
#   INPUT  defaults to 'bib.bib'
#   OUTPUT defaults to INPUT's file name prefixed with 'fixed', written
#          next to INPUT (so 'bib.bib' -> 'fixedbib.bib')
#
# The cleaner is line based: an entry is expected to start on a line
# beginning with '@' and to end on a line holding only '}', and a field is
# expected to start on its own line as `name = {...},`.

# fields that are dropped entirely (together with their continuation lines)
removefields = %w[note copyright issn doi shorttitle abstract file
                  urldate url language keywords]

# One regex for all unwanted fields. It is anchored at the start of the
# line so that e.g. 'note' does not also delete 'annote' and 'url' does not
# delete 'biburl'. BibTeX field names are case-insensitive, hence the /i.
# `.source` is interpolated (not the Regexp itself) so /i applies to the names.
removeregex = /^\s*(?:#{Regexp.union(removefields).source})\s*=/i

# [pattern, replacement] pairs, applied in this order to every kept line.
# The doubled backslashes are needed because gsub treats '\' in the
# replacement string as an escape character.
replacements = [[/--/, "-"],
                [/(pages = \{\d+) -(\d+\})/, "\\1-\\2"],
                [/ó/, "{\\\\'o}"],
                [/é/, "{\\\\'e}"],
                [/è/, "{\\\\`e}"],
                [/ö/, "{\\\\\"o}"],
                [/ø/, "{\\\\o}"],
                [/ü/, "{\\\\\"u}"],
                [/ñ/, "{\\\\~n}"],
                [/á/, "{\\\\'a}"],
                # NOTE(review): the next two patterns look identical here; they
                # may have been two different hyphen-like code points before
                # the file was copied - confirm against the original.
                [/‐/, "-"],
                [/‐/, "-"],
                [/σ/, "sigma"],
                [/ς/, "sigma"],
                [/′/, "$^{\\prime}$"]]

# characters that are simply deleted
removals = [/©/]

# sometimes we want to debug a specific entry: once a line matches this
# pattern, up to 20 lines in total are echoed to stdout
debugentry = /achberger_role_1981/
debuglimit = 20

file = ARGV[0] || 'bib.bib'
fixedfile = ARGV[1] || File.join(File.dirname(file), "fixed#{File.basename(file)}")

# fail before touching the output, so a typo in INPUT cannot truncate an
# existing cleaned file, and INPUT can never be overwritten by OUTPUT
abort "cannot find input file #{file}" unless File.file?(file)
if File.expand_path(file) == File.expand_path(fixedfile)
  abort 'input and output must be different files'
end

puts 'scanning file'

removedlines = 0
replaced = 0
removed = 0

# Both files are opened in block form so they are closed (and the output
# flushed) even if processing raises. UTF-8 is requested explicitly so the
# non-ASCII patterns above match regardless of the locale.
File.open(file, 'r:UTF-8') do |source|
  File.open(fixedfile, 'w:UTF-8') do |fixed|
    deletenextline = false
    indebug = false
    debugcount = 0

    source.each_line do |line|
      indebug = true if line =~ debugentry
      if indebug
        puts line
        debugcount += 1
        indebug = false if debugcount >= debuglimit
      end

      # always include the start and end lines of a bib entry; they also
      # terminate any field removal still in progress. \s* tolerates a
      # trailing CR (CRLF files) or blanks after the closing brace.
      if line =~ /^@/ || line =~ /^\}\s*$/
        deletenextline = false
        fixed << line
        next
      end

      # are we in the middle of a field that needs deleting?
      if deletenextline
        removedlines += 1
        tail = line.rstrip
        deletenextline = false if tail.end_with?('},') || tail =~ /^\}\,/
        next
      end

      # is this the first line of an unwanted field?
      # NOTE(review): a removed field whose value is not brace-delimited
      # (e.g. `language = english,`) never ends in '},' and therefore keeps
      # deleting lines until the next line that does, or the end of the entry.
      if line =~ removeregex
        removedlines += 1
        deletenextline = true unless line.rstrip.end_with?('},')
        next
      end

      # perform any replacements (gsub! returns nil when nothing changed)
      replacements.each do |pattern, replacement|
        replaced += 1 if line.gsub!(pattern, replacement)
      end

      # strip bad chars
      removals.each do |pattern|
        removed += 1 if line.gsub!(pattern, '')
      end

      # save
      puts line if indebug
      fixed << line
    end
  end
end

puts "removed #{removedlines} bad lines, replaced #{replaced} bad strings and removed #{removed} bad characters!"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment