#! /usr/local/bin/perl
# tex2bib
#   Input a TeX document containing \bibitems, translate these
#   to BibTeX format
#
# Usage:
#   tex2bib [-k] [-i infile] [-o outfile]
#     -k: regenerate keys
#     if infile not given, reads from stdin
#     if outfile not given, prints to stdout
#
# The entire tex document is scanned for \bibitems, ending when
# the string '\end{thebibliography}' is read.
#
# Assumes that bibitems are formatted as follows:
#  -- {key}author(s), (date) at the beginning
#  -- titles of books or names of journals: {\em title}
#  -- article titles: after date, `` '' quotes optional
#  -- volume, pages: {\it vol}, nnn-nnn.
#  -- publisher/address: address:publisher
#
# All text in the bibitem which cannot be parsed is included
# in a note = { } field
#################################################################
# Examples of a book, article, inproceedings:
#
#\bibitem{Bertin83}Bertin, J. (1983),
#  {\em Semiology of Graphics} (trans. W. Berg). Madison, WI:
#  University of Wisconsin Press.
#
#\bibitem{Bickel75}Bickel, P. J., Hammel, J. W. and O'Connell, J. W.
#  (1975).
#  Sex bias in graduate admissions: data from Berkeley. {\em
#  Science}, {\it 187}, 398-403.
#
#\bibitem{Farebrother87}Farebrother, R. W. (1987),
#  ``Mechanical representations of the ${L}_1$ and ${L}_2$ estimation
#  problems'', In Y. Dodge (ed.) {\em Statistical data analysis
#  based on the L1 norm and related methods}, Amsterdam:
#  North-Holland., 455-464.
#
# These are output as:
#
#@Book{ Bertin:83,
#  author = {J. Bertin},
#  year = 1983,
#  title = {Semiology of Graphics},
#  publisher = {University of Wisconsin Press},
#  address = {Madison, WI},
#  note = {(trans. W. Berg).}
#}
#@Article{ Bickel:75,
#  author = {Bickel, P. J. and Hammel, J. W. and O'Connell, J. W.},
#  year = 1975,
#  title = {Sex Bias in Graduate Admissions: Data from Berkeley},
#  journal = {Science},
#  volume = 187,
#  pages = {398--403}
#}
#@InCollection{ Farebrother:87,
#  author = {R. W. Farebrother},
#  year = 1987,
#  title = {Mechanical Representations of the ${L}_1$ and ${L}_2$ Estimation Problems},
#  booktitle = {Statistical Data Analysis Based on the L1 Norm and Related Methods},
#  editor = {Y. Dodge},
#  publisher = {North-Holland},
#  address = {Amsterdam},
#  pages = {455--464}
#}
#
# Text in a bibitem is removed from the bibitem as it is assigned to
# bibtex fields.  Any text remaining is assigned to a note={ } field
# at the end.  Since the parsing is heuristic, some manual fixup work
# can be expected at the end.
#################################################################

use strict;
use Getopt::Std;

our $Script  = 'tex2bib';
our $Version = 0.95;        # 5 Feb 1997 15:08:00
our $Author  = 'Michael Friendly (friendly@yorku.ca)';

# ---------------------------------------------------------------
# Shared parser state.  The parse_* subs communicate through these
# package variables: each one consumes part of $rest and stores what
# it recognised in the matching field variable.
# ---------------------------------------------------------------
our $opt_k;                 # true when -k (regenerate keys) was given
our %keys_seen;             # keys already emitted, for uniqueness
our @auth;                  # author last names (set by split_authors)
our ($key, $authors, $date, $rest);
our ($title, $pages, $journal, $volume, $number, $booktitle, $editor,
     $edition, $month, $publisher, $address, $chapter);

# ---------------------------------------------------------------
# Static lookup tables
# ---------------------------------------------------------------
our @months = qw(january february march april may june july august
                 september october november december);
our $month_pat   = join('|', @months);
our %month_abbrv = map { $_ => substr($_, 0, 3) } @months;

our $ordinal = 'first|second|third|fourth|1st\.?|2nd\.?|3rd\.?|4th\.?';

# title words to ignore for casing
our %ignore_case = map { $_ => 1 }
    qw(a about an and by for from in of on the to von with);

# Run the converter only when executed as a script, so that the
# subroutines below can also be loaded (require/do) without side effects.
main() unless caller;

########################### subroutines #############################

# Entry point: parse options, read the bibliography, convert every
# \bibitem and print the BibTeX entries.  Returns the number of entries
# written.
sub main {
    my %opt;
    # -o and -i take a value, -k is a plain flag.
    getopts('o:i:k', \%opt)
        or die "Usage: $Script [-k] [-i infile] [-o outfile]\n";
    $opt_k = $opt{k};

    if (defined $opt{i} && length $opt{i}) {
        open(STDIN, '<', $opt{i}) or die "-i $opt{i}: can't open.\n";
    }
    if (defined $opt{o} && length $opt{o}) {
        open(STDOUT, '>', $opt{o}) or die "-o $opt{o}: can't create.\n";
    }

    # All lines containing \bibitems are read into one string, which is
    # then split into separate items.
    my @items = split(/\\bibitem/, read_bibitems());
    print STDERR "items has ", scalar(@items), " items\n";

    %keys_seen = ();
    my $outitems = 0;
    foreach my $item (@items) {
        my $entry = convert_item($item);
        next unless defined $entry;
        print $entry;
        $outitems++;
    }
    print STDERR "$outitems items processed\n";
    return $outitems;
}

# Read input (files named in @ARGV, else STDIN) and return the text from
# the first \bibitem up to '\end{thebibliography}' joined into a single
# string.  Leading whitespace of each line is collapsed to one space.
sub read_bibitems {
    my $lines = '';
    my $bibs  = 0;
    while (my $line = <>) {
        last if $line =~ /^\s*\\end\{thebibliography/;
        # Accept any non-empty key and an optional [label] argument.
        $bibs++ if $line =~ /\\bibitem\s*(?:\[[^\]]*\]\s*)?\{[^}]+\}/;
        # skip if we're still reading tex text (no bibitems encountered)
        next unless $bibs;
        chomp $line;
        $line =~ s/^\s*/ /;
        $lines .= $line;
    }
    return $lines;
}

# Convert the text following one \bibitem into a BibTeX entry.
#   $item - text of the form "{key}Authors (year), ..."
# Returns the formatted entry, or undef when no {key} can be found
# (a diagnostic is written to STDERR unless the chunk is blank).
sub convert_item {
    my ($item) = @_;

    # Start every item from a clean slate so that nothing leaks in from
    # the previous entry.
    ($key, $authors, $date, $rest) = ();
    ($title, $pages, $journal, $volume, $number, $booktitle, $editor,
     $edition, $month, $publisher, $address, $chapter) = ();

    return unless defined $item && $item =~ /\S/;

    # assume each entry starts with
    #   [label]{key}Author, F.I. etc (date[a-z]?)
    unless ($item =~ /^\s*(?:\[[^\]]*\]\s*)?\{([^}]+)\}\s*/) {
        print STDERR "Skipping: no {key} found in \n$item\n";
        return;
    }
    ($key, $rest) = ($1, $');

    if ($rest =~ /\((\d{4})[a-z]?\)[,.]?\s*/) {
        ($authors, $date, $rest) = ($`, $1, $');
    }
    else {
        print STDERR "No (year) found for key $key\n";
    }
    $rest =~ s/^[.,]?\s+//;
    $rest =~ s/\s+$//;

    # Turn "Name83" into "Name:83"; keys that already contain a colon
    # are left alone.
    if ($key !~ /:/ && $key =~ m/(['A-Za-z]+)([^:]*)/) {
        $key = "$1:$2";
    }

    parse_authors();

    my $orig_key = $key;
    $key = generate_key() if $opt_k or $keys_seen{$key};

    # if the $key has already been seen, make a new, unique one
    if ($keys_seen{$key}) {
        foreach my $suffix ('b' .. 'z') {
            my $try = $key . $suffix;
            unless ($keys_seen{$try}) { $key = $try; last; }
        }
        print STDERR "New unique key generated: $key (was $orig_key)\n"
            unless $key eq $orig_key;
    }
    $keys_seen{$key}++;

    # assume it's an article at first
    my $type = '@Article';

    if ($rest =~ /^\{\\em\s+/) {            # tech report or book
        $type = '@Book';
        if ($rest =~ m/^\{\\em\s+(['\w ,?:]+)\}[., ]*/) {
            ($title, $rest) = ($1, $');
        }
        parse_edition();
        parse_publish();
        if ($rest =~ m/\sreport(s?)[,]?\s*/i) {
            $type = '@TechReport';
            $rest = $` . $';
            parse_report_number();
        }
    }
    else {
        # does it begin with quoted title?
        if ($rest =~ /^``/) {
            if ($rest =~ m/^``([^`]+)''[., ]*/) {
                ($title, $rest) = ($1, $');
            }
        }
        elsif ($rest =~ m#^(['\{\}\w\d,\(\)\\ :\?/-]+)[. ]*#) {
            ($title, $rest) = ($1, $');
        }
        $rest =~ s/^[,\s]+//;

        # NOTE: '-? inside the class is a character range (' through ?),
        # which also admits digits and common punctuation.
        if ($rest =~ m/^\{\\em\s+([\w\\& ,'-?]+)\}[., ]*/) {
            ($journal, $rest) = ($1, $');
        }

        # parse {\it volume (number)}
        parse_volume();

        if ($rest =~ m/,?\s*(\d{1,4}--?\d{1,4})[.,]?\s*/) {
            $pages = $1;
            $rest  = $` . $';
            $pages =~ s/(\d)-(\d)/$1--$2/;      # BibTeX page ranges use --
        }

        if ($rest =~ m/\(($month_pat)\)/i) {
            $month = $month_abbrv{ lc $1 };
            $rest  = $` . $';
        }

        if ($rest =~ m/(technical|tech\.)\s+report[,.]?/i) {
            $type = '@TechReport';
            $rest = $` . $';
            parse_report_number();
        }

        if ($rest =~ m/^In\s+/i) {
            $rest = $';
            $type = '@InCollection';            # maybe, proceedings
            if ($rest =~ m/\{\\em\s+([\w ,':!?]+)\}[., ]*/) {
                $booktitle = $1;
                $rest      = $` . $';
                $type = '@InProceedings' if $booktitle =~ m/proceedings/i;
            }
            if ($rest =~ m/([\w., ]+)\s+\(eds?\.\)[,.]?\s*/i) {
                $editor = $1;
                $rest   = $` . $';
            }
            if ($rest =~ m/Chapter\s+(\d+)[.,]?\s*/i) {
                $chapter = $1;
                $rest    = $` . $';
            }
            parse_edition();
            parse_publish();
        }

        $type = '@InProceedings' if $rest =~ m/proceedings/i;
    }

    $rest =~ s/^\s+//;
    # Drop a repeated year from the leftover text.
    $rest =~ s/\Q$date\E,?// if defined $date;

    $title     = case_title($title);
    $booktitle = case_title($booktitle) if $booktitle;

    return format_entry($type);
}

# Build the BibTeX text for the current entry from the shared field
# variables.  $type is the entry type including the leading '@'.
sub format_entry {
    my ($type) = @_;
    my $entry = "$type\{\t$key";
    $entry .= ",\n author\t= {$authors}" if defined $authors && length $authors;
    $entry .= ",\n year\t= $date"         if defined $date;
    $entry .= ",\n title\t= {$title}"     if $title;
    $entry .= ",\n booktitle\t= {$booktitle}" if $booktitle;
    $entry .= ",\n edition\t= {$edition}" if $edition;
    $entry .= ",\n editor\t= {$editor}"   if $editor;
    $entry .= ",\n publisher\t= {$publisher}" if $publisher;
    $entry .= ",\n address\t= {$address}" if $address;
    $entry .= ",\n journal\t= {$journal}" if $journal;
    $entry .= ",\n volume\t= $volume"     if $volume;
    $entry .= ",\n month\t= $month"       if $month;
    $entry .= ",\n number\t= $number"     if $number;
    $entry .= ",\n pages\t= {$pages}"     if $pages;
    $entry .= ",\n chapter\t= {$chapter}" if $chapter;
    $entry .= ",\n note\t= {$rest}"       if $rest;
    $entry .= "\n\}\n";
    return $entry;
}

# separate multiple authors with 'and'; operates on $authors in place
sub parse_authors {
    return unless defined $authors;

    # fix authors field
    $authors =~ s/\s+$//;
    $authors =~ s/\\&/and/;
    $authors =~ s/([A-Z])\.([A-Z])\./$1. $2./g;
    $authors =~ s/ ([A-Z])\.,\s+(?!Jr\.)/ $1. and /g;
    $authors =~ s/ and and / and /g;
    $authors =~ s/Friendly, M\./Friendly, Michael/;

    # A single "Last, F." author is rewritten as "F. Last".
    my $commas = ($authors =~ tr/,/,/);
    if ($commas == 1) {
        $authors = join(' ', reverse split(/, /, $authors));
    }
}

# extract last names of authors into @auth array
sub split_authors {
    @auth = split(/ and /, defined $authors ? $authors : '');
    foreach my $name (@auth) {
        $name =~ s/, .*$//;                 # strip trailing initials
        $name =~ s/^([A-Z]\.\s*)+//;        # strip leading initials
        my @parts = split(/ /, $name);
        $name = pop(@parts);                # strip anything before lastname
        $name = '' unless defined $name;
        $name =~ s/[^\w]//g;                # remove non-word chars
    }
}

# pull "{\it volume (number)}" out of $rest into $volume / $number;
# non-numeric values are wrapped in braces for BibTeX
sub parse_volume {
    if ($rest =~ m/,?\s*\{\\it\s+([A-Za-z \d()]+)\s*\}[.,]?/) {
        $volume = $1;
        $rest   = $` . $';
        if ($volume =~ m/\(([ \w]+)\)/) {
            ($volume, $number) = ($`, $1);
        }
        $volume =~ s/\s+$//;
        $volume = "{$volume}" unless $volume =~ m/^\d*$/;
        $number = "\{$number\}"
            unless !defined($number) || $number =~ m/^\d*$/;
    }
}

# find publisher and address in book/inproceedings items
# assume the format is address:publisher
sub parse_publish {
    if ($rest =~ m#:\s*([\w\\/& -]+)[.]?#) {
        $publisher = $1;
        my ($before, $after) = ($`, $');
        $publisher =~ s/\s+$//;
        # The address is whatever plain text immediately precedes the colon.
        if ($before =~ m/([\w, ]+)$/) {
            $address = $1;
            $before  = $`;
            $address =~ s/^\s+//;
        }
        $rest = $before . $after;
    }
}

# pull "no. NN" / "number NN" out of $rest into $number
sub parse_report_number {
    if ($rest =~ m/\s*(no\.|number)\s+(\d+)[,.]?\s*/i) {
        $number = $2;
        $rest   = $` . $';
    }
}

# pull "<ordinal> ... edition" / "<ordinal> ... ed." out of $rest into
# $edition; the match stops at the nearest "edition"/"ed."
sub parse_edition {
    if ($rest =~ m/\(?\b($ordinal).*?(edition|ed\.)\)?\.?\s*/i) {
        $edition = $1;
        $rest    = $` . $';
    }
}

# Initial-caps for all non-ignored words.  The first word and any word
# following one that contains a colon are always capitalised.
sub case_title {
    my ($t) = @_;
    my @words = split(/\s+/, defined $t ? $t : '');
    my $force = 1;
    foreach my $word (@words) {
        next if $word eq '';            # leading-whitespace artefact of split
        my $must_cap = $force;
        $force = ($word =~ tr/:/:/) ? 1 : 0;
        next if $word =~ /^[A-Z]/;
        next if !$must_cap && $ignore_case{$word};
        $word =~ s/^([a-z])/\u$1/;
    }
    return join(' ', @words);
}

# Build a key from the author last names and the two-digit year:
# "Name:yy" or "NameName:yy" for one or two authors, "Name-etal:yy"
# for three or more.
sub generate_key {
    my $yr = defined $date ? $date : '';
    $yr =~ s/^\d\d(?=\d\d$)//;          # drop the century, whatever it is
    split_authors();
    return scalar(@auth) < 3
        ? join('', @auth, ':', $yr)
        : join('', $auth[0], '-etal:', $yr);
}