#! /usr/bin/env perl
#
# Usage: ./urlbst.pl [--eprint] [--doi]
#    [--nohyperlinks] [--inlinelinks] [--hypertex] [--hyperref]
#    [input-file [output-file]]
# If either input-file or output-file is omitted, they are replaced by
# stdin or stdout respectively.
#
# See http://purl.org/nxg/dist/urlbst for documentation
#
# $Id: urlbst.in,v 1.8 2007/03/26 17:21:56 norman Exp $

$version = '0.6';
($progname = $0) =~ s/.*\///;
$mymarker = "% $progname";
$mymarkerend = "% ...$progname to here";
$myurl = 'http://purl.org/nxg/dist/urlbst';


$infile = '-';
$outfile = '-';

$addeprints = 0;     # if true (nonzero) we add support for eprints
$eprintprefix = 'arXiv:';	# make these settable with --eprint? syntax?
$eprinturl = 'http://arxiv.org/abs/';

$adddoiresolver = '0';
$doiprefix = 'doi:';
$doiurl = 'http://dx.doi.org/';

$makehref = 0;
$availablestring = "Available from: ";
$inlinelinks = 0;

$Usage = "$progname [--eprint] [--doi]\n    [--nohyperlinks] [--inlinelinks] [--hypertex] [--hyperref]\n    [--help] [input-file [output-file]]";

while ($#ARGV >= 0) {
    if ($ARGV[0] eq '--eprint') {
	$addeprints = 1;
    } elsif ($ARGV[0] eq '--doi') {
        $adddoiresolver = 1;
    } elsif ($ARGV[0] eq '--nohyperlinks') {
        $makehref = 0;
    } elsif ($ARGV[0] eq '--hypertex') {
	$makehref = 1;
    } elsif ($ARGV[0] eq '--hyperref') {
	$makehref = 2;
    } elsif ($ARGV[0] eq '--inlinelinks') {
        $inlinelinks = 1;
    } elsif ($ARGV[0] eq '--help') {
        print <<EOD;
urlbst version $version
    Usage: $Usage
      --eprint:       include support for `eprint' fields
      --doi:          include support for `doi' field
      --nohyperlinks  do not include active links anywhere
      --inlinelinks   add hyperlinks to entry titles
      --hypertex:     include HyperTeX-style hyperlink support
      --hyperref:     include {hyperref}-style hyperlink support
                      (generally better)
      --help:         print this help
    Input and output files may be given as `-' (default) to indicate stdin/out
EOD
        exit(0);
    } elsif ($ARGV[0] =~ /^-/) {
        die "Unrecognised option $ARGV[0]: Usage: $Usage\n";
    } elsif ($infile eq '-') {
	$infile = $ARGV[0];
    } elsif ($outfile eq '-') {
	$outfile = $ARGV[0];
    } else {
	die "Usage: $Usage\n";
    }
    shift(@ARGV);
}

if ($inlinelinks && $makehref == 0) {
    print <<'EOD';
Warning: --inlinelinks and --nohyperlinks were both specified (possibly
    implicitly).  That combination makes no sense, so I'll ignore
    --nohyperlinks and use --hyperref instead
EOD
    $makehref = 2;
}

$exitstatus = 0;		# success status

open (IN, "<$infile") || die "Can't open $infile to read";
open (OUT, ">$outfile") || die "Can't open $outfile to write";

# We have to make certain assumptions about the source files, in order
# to patch them at the correct places.  Specifically, we assume that
#
#   - there's a function init.state.consts
#
#   - ...and an output.nonnull which does the actual outputting, which
#         has the `usual' interface.
#
#   - we can replace
#           fin.entry
#       by
#           new.block
#           output.url    % the function which formats and displays any URL
#           fin.entry
#
#   - there is a function which handles the `article' entry type (this
#         will always be true)
#
#   - there is a function output.bibitem which is called at the
#         beginning of each entry type
#   - ...and one called fin.entry which is called at the end
#
# If the functions format.date, format.title or new.block are not defined (the
# former is not in apalike, for example, and the last is not in the
# AMS styles), then replacements are included in the output.
#
# All these assumptions are true of the standard files and, since most
# style files derive from them more or less directly, are true of most (?)
# other style files, too.
#
# There's some rather ugly Perl down here.  The parsing for
# brace-matching could probably do with being rewritten in places, to
# make it less ugly, and more robust.

print OUT "%%% Modification of BibTeX style file ", ($infile eq '-' ? '<stdin>' : $infile), "\n";
print OUT "%%% ... by $progname, version $version (marked with \"$mymarker\")\n%%% See <$myurl>\n";
print OUT "%%% Added webpage entry type, and url and lastchecked fields.\n";
print OUT "%%% Added eprint support.\n"   if ($addeprints);
print OUT "%%% Added DOI support.\n"      if ($adddoiresolver);
print OUT "%%% Added HyperTeX support.\n" if ($makehref == 1);
print OUT "%%% Added hyperref support.\n" if ($makehref == 2);
print OUT "%%% Original headers follow...\n\n";

$found{initconsts} = 0;
$found{outputnonnull} = 0;
$found{article} = 0;
$found{outputbibitem} = 0;
$found{finentry} = 0;
$found{formatdate} = 0;
$found{formattitle} = 0;
$found{newblock} = 0;

while (<IN>) {
    /^ *%/ && do {
        # Pass commented lines unchanged
        print OUT;
        next;
    };
    
    /^ *ENTRY/ && do {
	# Work through the list of entry types, finding what ones are there.
	# If we find a URL entry there already, object, since these edits
	# will mess things up.
	$line = $_;
	until ($line =~ /\{\s*(\w*)/) {
	    $line .= <IN>;
	}
	$bracematchtotal = 0;	# reset
	bracematcher($line);
	$line =~ /\{\s*(\w*)/;
	$found{'entry'.$1} = 1;
	print OUT $line;
	$line = <IN>;
	until (bracematcher($line) == 0) {
	    # XXX deal with multiple entries on one line
	    ($line =~ /^\s*(\w*)/) && ($found{'entry'.$1} = 1);
	    print OUT $line;
	    $line = <IN>;
	}
	if (defined($found{entryurl})) {
	    print STDERR "$progname: style file $infile already has URL entry!\n";
	    # print out the rest of the file, and give up
            print OUT $line;
	    while (<IN>) {
		print OUT;
	    }
	    $exitstatus = 1;
	    last;
	} else {
            print OUT "    eprint $mymarker\n    doi $mymarker\n    url $mymarker\n    lastchecked $mymarker\n";
	}
	print OUT $line;
	next;
    };

    /^ *FUNCTION *\{init\.state\.consts\}/ && do {
        # In the init.state.consts function, add an extra set of
        # constants at the beginning.  Also use this as the marker for
        # the place to add the init strings function.
        print OUT <<EOD;
STRINGS { urlintro eprinturl eprintprefix doiprefix doiurl openinlinelink closeinlinelink } $mymarker...
INTEGERS { hrefform inlinelinks makeinlinelink addeprints adddoiresolver }
% Following constants may be adjusted by hand, if desired
FUNCTION {init.urlbst.variables}
{
  "$availablestring" 'urlintro := % prefix before URL
  "$eprinturl" 'eprinturl := % prefix to make URL from eprint ref
  "$eprintprefix" 'eprintprefix := % text prefix printed before eprint ref
  "$doiurl" 'doiurl := % prefix to make URL from DOI
  "$doiprefix" 'doiprefix :=      % text prefix printed before DOI ref
  #$addeprints 'addeprints :=         % 0=no eprints; 1=include eprints
  #$adddoiresolver 'adddoiresolver :=     % 0=no DOI resolver; 1=include it
  #$makehref 'hrefform :=           % 0=no crossrefs; 1=hypertex xrefs; 2=hyperref refs
  #$inlinelinks 'inlinelinks :=        % 0=URLs explicit; 1=URLs attached to titles
  % the following are internal state variables, not config constants
  #0 'makeinlinelink :=     % state variable managed by setup.inlinelink
  "" 'openinlinelink :=     % ditto
  "" 'closeinlinelink :=    % ditto
}
INTEGERS { 
  bracket.state
  outside.brackets
  open.brackets
  within.brackets
  close.brackets
}
$mymarkerend
EOD
	$line = $_;
	until ($line =~ /\{.*\}.*\{/s) {
	    $line .= <IN>;
	}
	$line =~ s/(\{.*?\}.*?\{)/$1 #0 'outside.brackets := $mymarker
  #1 'open.brackets :=
  #2 'within.brackets :=
  #3 'close.brackets :=

 /s;
	print OUT $line;
	$found{initconsts} = 1;
	next;
    };

    /^ *EXECUTE *\{init\.state\.consts\}/ && do {
        print OUT "EXECUTE {init.urlbst.variables}\n";
        print OUT;
        next;
    };

    /^ *FUNCTION *\{new.block\}/ && do {
        $found{newblock} = 1;
    };

    /^ *FUNCTION *{output\.nonnull}/ && do {
	print OUT "$mymarker\n";
	print OUT "FUNCTION {output.nonnull.original}\n";
        copy_block();
	print_output_functions();
	$found{outputnonnull} = 1;
	next;
    };

    /FUNCTION *\{fin.entry\}/ && do {
        # Rename fin.entry to fin.entry.original (wrapped below)
        s/fin.entry/fin.entry.original/;
        $found{finentry} = 1;
        print OUT;
        next;
    };

    /^ *FUNCTION *{format\.date}/ && do {
	$found{formatdate} = 1;
	print OUT;
	next;
    };

    /^ *FUNCTION *{format\.title}/ && do {
        # record that we found this
	$found{formattitle} = 1;
        print OUT;
	next;
    };

    /^ *format\.b?title/ && do {
        # interpolate a call to setup.inlinelink
        print OUT "  title empty\$ 'skip\$ 'setup\.inlinelink if\$ $mymarker\n";
        print OUT;
        next;
    };

    /^ *FUNCTION *\{article\}/ && do {

	print_missing_functions();
	print_webpage_def();

	print OUT;
	$found{article} = 1;
	next;
    };

    /FUNCTION *\{output.bibitem\}/ && do {
        # Rename output.bibitem to output.bibitem.original (wrapped below)
        s/{output.bibitem\}/\{output.bibitem.original\}/;
 	$found{outputbibitem} = 1;
        print OUT;
        next;
    };

    print OUT;
};

if ($exitstatus == 0) {
    # Skip this if we've already reported an error -- it'll only be confusing
    foreach $k (keys %found) {
        if ($found{$k} == 0) {
            print STDERR "$progname: $infile: failed to find feature $k\n";
        }
    }
}

close (IN);
close (OUT);

exit $exitstatus;;






sub print_output_functions {
    print OUT "$mymarker...\n";

    print OUT <<'EOD';
% The following three functions are for handling inlinelink.  They wrap
% a block of text which is potentially output with write$ by multiple
% other functions, so we don't know the content a priori.
% They communicate between each other using the variables makeinlinelink
% (which is true if a link should be made), and closeinlinelink (which holds
% the string which should close any current link.  They can be called
% at any time, but start.inlinelink will be a no-op unless something has
% previously set makeinlinelink true, and the two ...end.inlinelink functions
% will only do their stuff if start.inlinelink has previously set
% closeinlinelink to be non-empty.
FUNCTION {setup.inlinelink}
{ makeinlinelink
    { hrefform #1 = % hypertex
        { "\special {html:<a href=" quote$ * url * quote$ * "> }{" * 'openinlinelink :=
          "\special {html:</a>}" 'closeinlinelink :=
          }
        { hrefform #2 = % hyperref
            { "\href{" url * "}{" * 'openinlinelink :=
              "}" 'closeinlinelink :=
              }
            'skip$
          if$ % hrefform #2 =
        }
      if$ % hrefform #1 =
      #0 'makeinlinelink :=
    }
    'skip$
 if$ % makeinlinelink
}
FUNCTION {add.inlinelink}
{ openinlinelink empty$
    'skip$
    { openinlinelink swap$ * closeinlinelink *
      "" 'openinlinelink :=
      }
  if$
}
EOD

    # new.block is defined elsewhere

    print OUT <<'EOD';
FUNCTION {output.nonnull}
{ % Save the thing we've been asked to output
  's :=
  % If the bracket-state is close.brackets, then add a close-bracket to
  % what is currently at the top of the stack, and set bracket.state
  % to outside.brackets
  bracket.state close.brackets =
    { "]" *
      outside.brackets 'bracket.state :=
    }
    'skip$
  if$
  bracket.state outside.brackets =
    { % We're outside all brackets -- this is the normal situation.
      % Write out what's currently at the top of the stack, using the
      % original output.nonnull function.
      s
      add.inlinelink
      output.nonnull.original % invoke the original output.nonnull
    }
    { % Still in brackets.  Add open-bracket or (continuation) comma, add the
      % new text (in s) to the top of the stack, and move to the close-brackets
      % state, ready for next time (unless inbrackets resets it).  If we come
      % into this branch, then output.state is carefully undisturbed.
      bracket.state open.brackets =
        { " [" * }
        { ", " * } % bracket.state will be within.brackets
      if$ 
      s * 
      close.brackets 'bracket.state :=
    }
  if$
}

% Call this function just before adding something which should be presented in 
% brackets.  bracket.state is handled specially within output.nonnull.
FUNCTION {inbrackets}
{ bracket.state close.brackets =
    { within.brackets 'bracket.state := } % reset the state: not open nor closed
    { open.brackets 'bracket.state := }
  if$
}

FUNCTION {format.lastchecked}
{ lastchecked empty$
    { "" }
    { inbrackets "cited " lastchecked * }
  if$
}
EOD
    print OUT "$mymarkerend\n";
}

sub print_webpage_def {
    print OUT "$mymarker...\n";

    # Some of the functions below call new.block, so we need a dummy
    # version, in the case where the style being edited doesn't supply
    # that function.
    if (! $found{newblock}) {
        print OUT "FUNCTION {new.block} % dummy new.block function\n{\n  % empty\n}\n\n";
        $found{newblock} = 1;
    }

    print OUT <<'EOD';
% Functions for making hypertext links.
% In all cases, the stack has (link-text href-url)
%
% make 'null' specials
FUNCTION {make.href.null}
{
  pop$
}
% make hypertex specials
FUNCTION {make.href.hypertex}
{ 
  "\special {html:<a href=" quote$ *
  swap$ * quote$ * "> }" * swap$ *
  "\special {html:</a>}" *
}
% make hyperref specials
FUNCTION {make.href.hyperref}
{ 
  "\href {" swap$ * "} {\path{" * swap$ * "}}" *
}
FUNCTION {make.href}
{ hrefform #2 =
    'make.href.hyperref      % hrefform = 2
    { hrefform #1 =
        'make.href.hypertex  % hrefform = 1
        'make.href.null      % hrefform = 0 (or anything else)
      if$
    }
  if$
}

% If inlinelinks is true, then format.url should be a no-op, since it's
% (a) redundant, and (b) could end up as a link-within-a-link.
FUNCTION {format.url}
{ inlinelinks #1 = url empty$ or
   { "" }
   { hrefform #1 =
       { % special case -- add HyperTeX specials
         urlintro "\url{" url * "}" * url make.href.hypertex * }
       { urlintro "\url{" * url * "}" * }
     if$
   }
  if$
}

FUNCTION {format.eprint}
{ eprint empty$
    { "" }
    { eprintprefix eprint * eprinturl eprint * make.href }
  if$
}

FUNCTION {format.doi}
{ doi empty$
    { "" }
    { doiprefix doi * doiurl doi * make.href }
  if$
}

% Output a URL.  We can't use the more normal idiom (something like
% `format.url output'), because the `inbrackets' within
% format.lastchecked applies to everything between calls to `output',
% so that `format.url format.lastchecked * output' ends up with both
% the URL and the lastchecked in brackets.
FUNCTION {output.url}
{ url empty$
    'skip$ 
    { new.block 
      format.url output
      format.lastchecked output 
    }
  if$
}

FUNCTION {output.web.refs}
{
  new.block
  output.url
  addeprints eprint empty$ not and
    { format.eprint output.nonnull }
    'skip$
  if$
  adddoiresolver doi empty$ not and
    { format.doi output.nonnull }
    'skip$
  if$
}

% Wrapper for output.bibitem.original.
% If the URL field is not empty, set makeinlinelink to be true,
% so that an inline link will be started at the next opportunity
FUNCTION {output.bibitem}
{ outside.brackets 'bracket.state :=
  output.bibitem.original
  inlinelinks url empty$ not and
    { #1 'makeinlinelink := }
    { #0 'makeinlinelink := }
  if$
}

% Wrapper for fin.entry.original
FUNCTION {fin.entry}
{ output.web.refs  % urlbst
  makeinlinelink       % ooops, it appears we didn't have a title for inlinelink
    { setup.inlinelink % add some artificial link text here, as a fallback
      "[link]" output.nonnull }
    'skip$
  if$
  bracket.state close.brackets = % urlbst
    { "]" * }
    'skip$
  if$
  fin.entry.original
}

% Webpage entry type.
% Title and url fields required;
% author, note, year, month, and lastchecked fields optional
% See references 
%   ISO 690-2 http://www.nlc-bnc.ca/iso/tc46sc9/standard/690-2e.htm
%   http://www.classroom.net/classroom/CitingNetResources.html
%   http://neal.ctstateu.edu/history/cite.html
%   http://www.cas.usf.edu/english/walker/mla.html
% for citation formats for web pages.
FUNCTION {webpage}
{ output.bibitem
  author empty$
    { editor empty$
        'skip$  % author and editor both optional
        { format.editors output.nonnull }
      if$
    }
    { editor empty$
        { format.authors output.nonnull }
        { "can't use both author and editor fields in " cite$ * warning$ }
      if$
    }
  if$
  new.block
  title empty$ 'skip$ 'setup.inlinelink if$
  format.title "title" output.check
  inbrackets "online" output
  new.block
  year empty$
    'skip$
    { format.date "year" output.check }
  if$
  % We don't need to output the URL details ('lastchecked' and 'url'),
  % because fin.entry does that for us, using output.web.refs.  The only
  % reason we would want to put them here is if we were to decide that
  % they should go in front of the rather miscellaneous information in 'note'.
  new.block
  note output
  fin.entry
}
EOD

    print OUT "$mymarkerend\n\n\n";
}


sub print_missing_functions {
    # We've got to the bit of the file which handles the entry
    # types, so write out the webpage entry handler.  This uses
    # the format.date function, which which many but not all
    # bst files have (for example, apalike doesn't).  So
    # check that we either have found this function already, or
    # add it.
    if (! $found{formatdate}) {
	if ($found{entrymonth}) {
	    print OUT <<'EOD';
FUNCTION {format.date}
{ year empty$
    { month empty$
	{ "" }
	{ "there's a month but no year in " cite$ * warning$
	  month
	}
      if$
    }
    { month empty$
	'year
	{ month " " * year * }
      if$
    }
  if$
}
EOD
	} else {
	    print OUT <<'EOD';
FUNCTION {format.date}
{ year empty$
    'skip$
    { %write$
      "(" year * ")" *
    }
  if$
}
EOD
  	}
	$found{formatdate} = 1;
    }

    # If the style file didn't supply a format.title function, then supply
    # one here (the {webpage} function requires it).
    if (! $found{formattitle}) {
	print OUT <<'EOD';
FUNCTION {format.title}
{ title empty$
    { "" }
    { title "t" change.case$ }
  if$
}
EOD
    $found{formattitle} = 1;
    }
}

# Utility function: Keep track of open and close braces in the string argument.
# Keep state in $bracematchtotal, return the current value.
sub bracematcher {
    my $s = shift;
    $s =~ s/[^\{\}]//g;
    #print "s=$s\n";
    foreach my $c (split (//, $s)) {
	$bracematchtotal += ($c eq '{' ? 1 : -1);
    }
    return $bracematchtotal;
}

# Utility function: use bracematcher to copy the complete block which starts
# on or after the current line.
sub copy_block {
    $bracematchtotal = 0;
    # copy any leading lines which don't have braces (presumably comments)
    while (defined ($line = <IN>) && ($line !~ /{/)) {
        print OUT $line;
    }
    while (defined ($line) && bracematcher($line) > 0) {
        print OUT $line;
        $line = <IN>;
    }
    print OUT "$line\n";	# print out terminating \} (assumed
    				# alone on the line)
}