#!/bin/sh
# tla-update-ids -- Handle adding arch id-tags to new files, removing
#	deleted explicit ids, and some renames
#
#  Copyright (C) 2003, 2004  Miles Bader <miles@gnu.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# Written by Miles Bader <miles@gnu.org>
#
#-
#   -n, --dry-run            Show what actions would be performed, but do not
#                            actually modify anything.  WARNING: --dry-run does
#                            not correctly descend into new subdirectories; it
#                            will show the new directory as being added but
#                            won't operate on its contents.
#
#   -T, --tagline-rules      Display the rules used for determining file tagline
#                            syntax (in the same format as
#                            {arch}/=tagline-rules), and exit.
#
#       --id-hint-tree=TREE_ROOT
#                            When adding an id-tag to a new file, search
#                            TREE_ROOT for the same file (by name) and use its
#                            id-tag if found.  Multiple --id-hint-tree options
#                            may be specified, in which case they are searched
#                            in the order given.
#
#   -h, --help               Display a help message and exit
#       --help-tagline-rules Display description of the syntax of the
#                            {arch}/=tagline-rules file, and exit.
#
#   -V, --version            Display a release identifier string and exit
# 
# tla-update-ids will find files in the current project tree which are
# missing arch id-tags, and give them id-tags; also it will find explicit
# id-tags for which the corresponding file has been removed, and remove those
# id-tags.
#
# It will also try to discover cases where a file with an explicit id-tag has
# been renamed, and move the explicit id-tag (instead of removing the old
# id-tag, and giving the file a new one); such detection is done by comparing
# the contents of any `new' files with the most recent contents of files that
# have been removed, and seeing whether less than 10% of their lines are
# different.
#
# When adding an id, if the project tree's id-tagging-method is `tagline',
# tla-update-ids attempts to identify the file's type, and add an appropriate
# tagline; if the tree's id-tagging-method is `explicit', or a file's type
# cannot be identified, then it adds an explicit id-tag instead.  The rules
# for determining tagline syntax may be customized using an
# {arch}/=tagline-rules file; use the `--help-tagline-rules' option to see
# a description of the syntax of that file.
#
# An exit status of 0 means nothing was done, 1 means some taglines or
# explicit ids were added, and anything else means there was some sort of
# error.

# (---- beginning of hdr.shpp ----)
# hdr.shpp

# Name this script was invoked under; used as a prefix in messages.
# Taken with parameter expansion rather than an unquoted `basename $0`,
# so an invocation path containing whitespace or glob characters is
# not split or expanded.
me=${0##*/}

# Installation directory and the external tools used by the script.
bindir='/usr/bin'
AWK='/usr/bin/nawk'; export AWK
TLA='tla'; export TLA
SED='/bin/sed'; export SED
UUIDGEN='uuidgen'; export UUIDGEN

# (---- TLA_TOOLS_VERSION defined from ,tla-tools-version ----)
TLA_TOOLS_VERSION='jgoerzen@complete.org--debian/tla-tools--debian--1.0--patch-18
'
# (---- end of TLA_TOOLS_VERSION defined from ,tla-tools-version ----)

# Prefix prepended to a bare script name to find the installed script
# file; empty only when bindir is unset.
TLA_TOOL_PFX="${bindir+$bindir/}"
export TLA_TOOL_PFX

TLA_ESCAPE='yes'

# Option passed to `tla tree-lint' when TLA_ESCAPE is enabled.
if test "$TLA_ESCAPE" = yes; then
  TLA_UNESCAPED_OPT='--unescaped'
else
  TLA_UNESCAPED_OPT=''
fi

# Some tools get confused by non-default settings of LANG (gawk, for
# instance, then mishandles regexp character ranges).
LANG=C; export LANG

# (---- end of hdr.shpp ----)
# (---- beginning of cmd-line.shpp ----)
# cmd-line.shpp -- Command-line helper functions for shell scripts

# Path of the file holding this script's text (the help functions read
# their output from its header comment).  A name without any slash is
# taken to live in the tool directory.
script=$0
if test "${script#*/}" = "$script"; then
  # Stripping up to the first slash changed nothing: no slash present.
  script="${TLA_TOOL_PFX}${script}"
fi

# Print a usage line on stdout, read from the header comment of $script.
# If a "# Usage:" paragraph is reached first, its first line is printed
# with the leading "# " removed; if the first non-comment line or bare
# "#"/"#-" line is reached first, a generic usage line is printed.
usage ()
{
  $SED -n \
    -e "/^\\([^#]\\|#-* *\$\\)/{s@.*@Usage: $me [--help|--version]@p;q;}" \
    -e '/^# *Usage:/,/^# *$/{s/^# //p;q;}' \
    < "$script"
}

# Print the short description found at the top of $script's header
# comment, re-filled by fmt.  Reading stops at the first non-comment
# line, bare "#" line or "# Usage:" line; the "#!" line is dropped,
# anything up to a "-- " separator is removed, and the comment leader
# is stripped from each remaining line.
short_help ()
{
  {
    $SED -n \
      -e '/^\([^#]\|-*# *$\|# *Usage:\)/q' \
      -e '/^#!/d;s/^.*-- */# /;s/^#[ 	]*//p'
  } < "$script" | fmt
}

# Print the detailed help text: the comment lines of $script that follow
# the "#-" marker line, with the leading "# " (or a lone "#") removed.
# Output stops at the first blank line of the file.
help_body ()
{
  $SED -n \
    -e '/^ *$/q' \
    -e '/^#-/,/^[^#]/s/^#\( \|$\)//p' \
    < "$script"
}

# Print the full help message on stdout: the usage line, the short
# description, an empty separator line, then the detailed help body.
help ()
{
  usage
  short_help
  printf '\n'
  help_body
}

# Print version information on stdout: a "NAME (tla-tools) RELEASE" line,
# followed by the "Written by" and "Copyright" lines found in $script's
# header comment (each preceded by an empty line).
version ()
{
  # TLA_TOOLS_VERSION ends in a newline; command substitution strips it.
  # The declaration is kept apart from the assignment so that the value
  # is never word-split by shells that treat `local' as a plain command.
  local no_nl_vers
  no_nl_vers=$(printf '%s' "$TLA_TOOLS_VERSION")
  echo "$me (tla-tools) $no_nl_vers"
  # Only the header is scanned: stop at the first non-comment line or at
  # the "#-" marker line.
  $SED -n '/^[^#]/q;/^#-/q;s/^# *\(Written by\)/\
\1/p' < "$script"
  $SED -n '/^[^#]/q;/^#-/q;s/^# *\(Copyright\)/\
\1/p' < "$script"
}

# Report on stderr that option $1 is not recognized, followed by a hint
# pointing at --help.
unrec_opt ()
{
  {
    echo "$me: unrecognized option \`$1'"
    echo "Try \`$me --help' for more information."
  } 1>&2
}

# Report a command-line error on stderr: the usage line followed by a
# hint pointing at --help.
cmd_line_err ()
{
  {
    usage
    echo "Try \`$me --help' for more information."
  } 1>&2
}

# Print the value part of a long option given as "--name=value" in $1,
# i.e. everything after the first "=".  An argument without "=" is
# printed unchanged.
#
# Done with parameter expansion rather than echo piped to sed: only the
# first "=" of the whole argument is considered (sed worked line by line
# on values with embedded newlines), and the value is never passed
# through echo, which rewrites backslash sequences in some shells.
long_opt_val ()
{
  printf '%s\n' "${1#*=}"
}

# Print the value attached to a short option given as "-xVALUE" in $1,
# i.e. $1 with its leading "-" and option letter removed.  An argument
# not of that form is printed unchanged.
#
# Done with parameter expansion rather than echo piped to sed, so the
# argument is never interpreted by echo (as an option, or for backslash
# sequences in some shells).
short_opt_val ()
{
  printf '%s\n' "${1#-?}"
}

# (---- end of cmd-line.shpp ----)

# (---- TLA_AWK_FUNS defined from tla-tools-funs.awk ----)
# Library of AWK helper functions, spliced into the AWK programs this
# script runs.  The text is a single-quoted shell string, so every
# apostrophe inside it is written as '\''.
TLA_AWK_FUNS='# tla-tools-funs.awk -- AWK functions used by my tla-* shell scripts

# Return CMD with ARG appended as a single-quoted shell word; a false
# (empty or zero) ARG is skipped and CMD is returned unchanged.
function _append_cmd_arg(cmd, arg)
{
  if (arg) {
    gsub (/'\''/, "'\''\\'\'''\''", arg)
    cmd = cmd " '\''" arg "'\''"
  }
  return cmd
}

# Return a shell command string corresponding to CMD with args
# ARG1...ARG4.  CMD is included as-is, so can contain shell
# meta-characters; ARG1...ARG4 are quoted to prevent evaluation by the
# shell, and correctly handle any embedded spaces.
function make_cmd(cmd, arg1, arg2, arg3, arg4)
{
  cmd = _append_cmd_arg(cmd, arg1)
  cmd = _append_cmd_arg(cmd, arg2)
  cmd = _append_cmd_arg(cmd, arg3)
  cmd = _append_cmd_arg(cmd, arg4)
  return cmd
}

# Run CMD with args ARG1...ARG4, return non-zero if successful.
# CMD is passed raw to the shell, so can contain shell meta-characters;
# ARG1...ARG4 are quoted to prevent evaluation by the shell, and
# correctly handle any embedded spaces.  Returns 1 if the command
# succeeded, and 0 otherwise.
function run_cmd(cmd, arg1, arg2, arg3, arg4)
{
  # print "run_cmd: " make_cmd(cmd, arg1, arg2, arg3, arg4)
  return (system(make_cmd(cmd, arg1, arg2, arg3, arg4)) == 0) ? 1 : 0
}

# Run CMD with args ARG1...ARG4, return the first line of output, or 0
# if the command returned a failure status (or the command could not be
# executed).  CMD is passed raw to the shell, so can contain shell
# meta-characters; ARG1...ARG4 are quoted to prevent evaluation by the
# shell, and correctly handle any embedded spaces.
function run_cmd_first_line(cmd, arg1, arg2, arg3, arg4  ,result)
{
  cmd = make_cmd(cmd, arg1, arg2, arg3, arg4)
  if ((cmd| getline result) <= 0)
    result = 0
  close (cmd)
  # print "run_cmd_first_line: " cmd " => " result
  return result
}

# Return the first line of FILE
function file_first_line(file)
{
  return run_cmd_first_line("sed 1q", file)
}

# Return the last line of FILE
function file_last_line(file)
{
  return run_cmd_first_line("sed -n", "$p", file)
}

# Return the number of lines in FILE
function file_num_lines(file)
{
  return run_cmd_first_line("wc -l <", file) + 0
}

# Return 1 if FILE is a directory (checked by listing FILE "/."), and 0
# otherwise.
function file_is_dir(file)
{
  return run_cmd("ls -d >/dev/null 2>/dev/null", file "/.")
}

# Return true if a line could be read from FILE or end-of-file was
# reached, i.e. FILE could be opened for reading (it may be empty).
function file_exists(file  ,line,result)
{
  result = (getline line < file)
  close (file)
  return result >= 0
}

# Append TEXT to FILE, with an intervening blank line if LAST_LINE
# isn'\''t blank.  Returns 1 if successful, and 0 otherwise.
function append_text(file, text, last_line  ,append_cmd)
{
  append_cmd = make_cmd("cat >>", file)
  if (last_line && last_line !~ /^[ \t]*$/)
    print "" |append_cmd
  printf ("%s\n", text) |append_cmd
  return close (append_cmd) == 0
}

# Return the name of the .arch-ids directory that holds the explicit
# id file of FILE.
function file_explicit_id_dir(file  ,dir)
{
  # A name without any slash lives in the current directory.  The slash
  # itself is tested (rather than comparing the directory part with the
  # base name) so that a path such as "foo/foo" keeps its directory.
  if (file !~ /\//)
    return ".arch-ids"

  dir = file
  sub (/\/[^\/]*$/, "", dir)
  return (dir != "" ? dir "/.arch-ids" : ".arch-ids")
}

# Return the name of the explicit id file of FILE, i.e.
# DIR/.arch-ids/BASENAME.id
function file_explicit_id_file(file  ,dir)
{
  # See file_explicit_id_dir for why the slash is tested directly.
  if (file !~ /\//)
    return ".arch-ids/" file ".id"

  dir = file
  sub (/\/[^\/]*$/, "", dir)
  sub (/.*\//, "", file)
  return (dir != "" ? dir "/.arch-ids/" : ".arch-ids/") file ".id"
}

# Inverse of file_explicit_id_file: given the name of an explicit id
# file (DIR/.arch-ids/BASENAME.id), return the name of the file it
# describes (DIR/BASENAME).
function file_from_explicit_id_file(file  ,dir)
{
  sub (/\.id$/, "", file)

  dir = file
  sub (/\/[^\/]*$/, "", dir)
  sub (/.*\//, "", file)

  # Whatever precedes ".arch-ids" keeps its trailing slash, so DIR and
  # the base name can simply be concatenated.
  sub (/\.arch-ids$/, "", dir)

  return dir file
}

# Return true if the explicit id file of FILE can be opened.
function file_has_explicit_id(file)
{
  return file_exists(file_explicit_id_file(file))
}

# Returns the id-tag and tagging-method of FILE, in tla "METH_ID" format
# (i.e., explicit ids have "x_" prepended to them, and taglines have "i_").
# FILE may be in a different project tree than the current directory.
# If no id can be found for FILE, 0 is returned instead.
# Results are cached in the global array _file_meth_ids.
function file_meth_id(file  ,output,parts)
{
  if (! (file in _file_meth_ids)) {
    output = run_cmd_first_line("$TLA id 2>/dev/null", file)
    if (! output)
      return 0

    split (output, parts)
    _file_meth_ids[file] = parts[2]
  }

  return _file_meth_ids[file]
}

# Returns the id-tag of FILE.
# FILE may be in a different project tree than the current directory.
# If no id can be found for FILE, 0 is returned instead.
function file_id(file  ,id)
{
  id = file_meth_id(file)
  if (id)
    sub (/^._/, "", id)
  return id
}

# Return the (absolute) filename corresponding to ID in TREE_ROOT,
# or zero if there is none.  If DIRS_ONLY is true, only directories are
# searched for (which can be slightly faster).
function id_file(id, tree_root, dirs_only  ,level,type_opt,inven_cmd,cmd_status,inven_line,parts)
{
  # Level 1 means only directories have been inventoried, level 2 means
  # both files and directories.
  level = dirs_only ? 1 : 2;

  if (_id_files_tree_level[tree_root] + 0 < level) {
    # We have not searched TREE_ROOT before, or only searched for dirs
    type_opt = (dirs_only ? " --directories" : " --both")

    inven_cmd = make_cmd("$TLA inventory --ids --source 2>/dev/null" type_opt, tree_root)

    while ((cmd_status = (inven_cmd |getline inven_line)) > 0) {
      split (inven_line, parts)

      # Add to _file_meth_ids array since we have the info handy
      _file_meth_ids[parts[1]] = parts[2]

      # Add all entries to _id_files
      sub (/^._/, "", parts[2])
      _id_files[parts[2], tree_root] = parts[1]
    }

    if (cmd_status >= 0)
      close (inven_cmd)

    _id_files_tree_level[tree_root] = level
  }

  return _id_files[id, tree_root]
}

# Return a prefix suitable for prepending to filenames in the current
# directory to make them properly project-tree-root relative, to the
# tree-root TREE_ROOT; if TREE_ROOT is zero (or not given), then the tla
# `tree-root'\'' command is invoked to compute the current tree-root.  If
# the current directory is a tree-root, then the result is the empty
# string.
function tree_root_prefix(tree_root  ,cwd)
{
  if (! tree_root)
    tree_root = run_cmd_first_line("$TLA tree-root 2>/dev/null")
  cwd = run_cmd_first_line("pwd")
  # The current directory is inside the tree only if its name starts
  # with TREE_ROOT followed by a slash; a bare prefix match would also
  # accept a sibling directory such as TREE_ROOT "-old".
  if (cwd != tree_root && index (cwd, tree_root "/") == 1)
    return substr (cwd, length (tree_root) + 2) "/"
  else
    return ""
}

# Return the path to FILE in a pristine version (either a revision
# library entry or a pristine tree) of the latest revision, or 0 if one
# cannot be found.  The root that was found is remembered in the global
# variable pristine_root.
function pristine_file(file  ,latest_rev,revlib,revlibs_cmd,revlibs_cmd_status,greedy)
{
  if (! pristine_root) {
    # Find the latest revision and make sure we have a pristine tree for
    # it; by `pristine tree'\'' we really mean revlib entry or pristine tree

    latest_rev = run_cmd_first_line("$TLA logs -f | sed -n '\''$p'\''")

    # See if we'\''ve got a revlib entry handy
    pristine_root = run_cmd_first_line("$TLA library-find --silent", latest_rev)

    if (! pristine_root) {
      # No revlib entry; can we add one to a greedy library?

      # Search for a greedy revision library
      revlibs_cmd = make_cmd("$TLA my-revision-library 2>/dev/null")
      while ((revlibs_cmd_status = (revlibs_cmd |getline revlib)) > 0) {
        greedy = run_cmd_first_line(make_cmd("$TLA library-config", revlib) \
                                    "| grep '\''^greedy[?]'\''")
        if (greedy ~ /yes$/)
          break
      }
      if (revlibs_cmd_status >= 0)
        close (revlibs_cmd)

      if (revlibs_cmd_status > 0) {
        # Found a greedy library, add an entry for this revision to it

        if (run_cmd("$TLA library-add", latest_rev))
          pristine_root = run_cmd_first_line("$TLA library-find", latest_rev)
      }

      if (! pristine_root) {
        # Give up with revlibs and try to add a pristine tree

        if (run_cmd("$TLA add-pristine", latest_rev))
          pristine_root = run_cmd_first_line("$TLA find-pristine", latest_rev)
      }
    }
  }

  if (pristine_root)
    return pristine_root "/" file
  else
    return 0
}

# Return a unique ID string
function unique_id() { return run_cmd_first_line("$UUIDGEN") }

# Return the filename FILE with any leading `./'\'' removed
function no_dot(file) { sub (/^\.\//, "", file); return file }

# Returns the (fully-specified) revision REV with the patch-level
# component removed
function revision_version(rev  ,archive,parts,ver)
{
  if (split (rev, parts, "/") == 2) {
    archive = parts[1]
    rev = parts[2]
  } else
    archive = 0

  split (rev, parts, "--")

  ver = parts[1] "--" parts[2] "--" parts[3]
  if (archive)
    ver = archive "/" ver

  return ver
}

# Returns the patch-level component of the (fully-specified) revision REV
function revision_patch_level(rev  ,parts)
{
  # Note that the archive component can have embedded -- markers too,
  # but that does not affect the result
  return parts[split (rev, parts, "--")]
}

# Return the name, relative to the tree root, of the patch-log file for
# the fully-specified revision REV (ARCHIVE/CATEGORY--BRANCH--VERSION--PATCH).
function patch_log_file_name(rev   ,archive,parts)
{
  split (rev, parts, "/")
  archive = parts[1]
  rev = parts[2]

  split (rev, parts, "--")

  return                                                                \
    "{arch}/"                                                           \
    parts[1]                                                            \
    "/" parts[1] "--" parts[2]                                          \
    "/" parts[1] "--" parts[2] "--" parts[3]                            \
    "/" archive                                                         \
    "/patch-log/" parts[4]
}

'
# (---- end of TLA_AWK_FUNS defined from tla-tools-funs.awk ----)
# (---- TAGLINE_AWK_FUNS defined from tagline-funs.awk ----)
# AWK code that decides which tagline syntax a file should get: the
# built-in rule tables, the reader for {arch}/=tagline-rules, and the
# lookup function file_tagline_type.  The text is a single-quoted shell
# string, so every apostrophe inside it is written as '\''.
TAGLINE_AWK_FUNS='# tagline-funs.awk -- AWK functions used for manipulating arch taglines

BEGIN {
  # Filename to file-type rules; the index is (LEVEL, KIND, REGEXP)

  tagline_type[9, "ext", "am"]                  = "automake"
  tagline_type[9, "ext", "ac"]                  = "autoconf"
  tagline_type[9, "name", "configure\\.in"]     = "autoconf"

  # Override shell-script recognition
  tagline_type[9, "name", "configure"]          = "explicit"

  # There'\''s no way of inserting an automatically deleted comment in
  # autoconf input files, so they end up clashing with the resulting
  # generated file.  Thus we must use explicit tags (sigh).
  tagline_type[9, "ext", "in"]                  = "explicit"

  tagline_type[10, "name", "\\.arch-inventory"] = "sh"
  tagline_type[10, "name", "ChangeLog.*"]       = "lisp"
  tagline_type[10, "name", "[Mm]akefile.*"]     = "sh"
  tagline_type[10, "name", "\\.gdbinit.*"]      = "sh"
  tagline_type[10, "name", "Imakefile"]         = "c" # run through cpp
  tagline_type[10, "name", "texinfo\\.tex"]     = "texi" # texinfo, not tex

  tagline_type[10, "ext", "c"]          = "c"    #:  /* arch-tag: ...\n  ... */
  tagline_type[10, "ext", "h"]          = "c"
  tagline_type[10, "ext", "s"]          = "c"    # fed through cpp

  tagline_type[10, "ext", "c\\+\\+"]    = "c++"  #:  // arch-tag: ...
  tagline_type[10, "ext", "cc"]         = "c++"
  tagline_type[10, "ext", "cxx"]        = "c++"
  tagline_type[10, "ext", "cpp"]        = "c++"
  tagline_type[10, "ext", "C"]          = "c++"
  tagline_type[10, "ext", "CC"]         = "c++"
  tagline_type[10, "ext", "h\\+\\+"]    = "c++"
  tagline_type[10, "ext", "hh"]         = "c++"
  tagline_type[10, "ext", "hxx"]        = "c++"
  tagline_type[10, "ext", "hpp"]        = "c++"
  tagline_type[10, "ext", "H"]          = "c++"
  tagline_type[10, "ext", "HH"]         = "c++"

  tagline_type[10, "ext", "el"]         = "lisp" #:  ;; arch-tag: ...
  tagline_type[10, "ext", "l"]          = "lisp"

  tagline_type[10, "ext", "pas"]        = "pascal" #:  (* arch-tag: ...\n  ... *)
  tagline_type[10, "ext", "dpr"]        = "pascal"
  tagline_type[10, "ext", "pp"]         = "pascal"

  tagline_type[10, "ext", "sh"]         = "sh"   #:  # arch-tag: ...
  tagline_type[10, "ext", "bash"]       = "sh"
  tagline_type[10, "ext", "csh"]        = "sh"
  tagline_type[10, "ext", "sed"]        = "sh"
  tagline_type[10, "ext", "awk"]        = "sh"
  tagline_type[10, "ext", "perl"]       = "sh"
  tagline_type[10, "ext", "pl"]         = "sh"   # perl
  tagline_type[10, "ext", "py"]         = "sh"   # python
  tagline_type[10, "ext", "tit"]        = "sh"   # by examination
  tagline_type[10, "ext", "inp"]        = "sh"   # by examination

  tagline_type[10, "ext", "m4"]         = "m4"

  tagline_type[10, "ext", "tex"]        = "tex"  #:  % arch-tag: ...
  tagline_type[10, "ext", "sty"]        = "tex"
  tagline_type[10, "ext", "erl"]        = "tex"  # erlang
  tagline_type[10, "ext", "hrl"]        = "tex"

  tagline_type[10, "ext", "texi"]       = "texi" #:  @c arch-tag: ...
  tagline_type[10, "ext", "texinfo"]    = "texi"

  tagline_type[10, "ext", "[1-9]"]      = "roff" #:  .\" arch-tag: ...

  tagline_type[10, "ext", "html"]       = "html" #:  <!-- arch-tag: ...\n -->
  tagline_type[10, "ext", "xml"]        = "html"

  tagline_type[10, "ext", "bat"]        = "bat"  #:  rem arch-tag: ...

  tagline_type[10, "ext", "pov"]        = "c++"  # povray scene files

  # various script magic numbers all map to "sh"
  tagline_type[10, "header", "#! *[^ ]*/[a-z]*(sh|awk|perl)( .*)?"] = "sh"

  ## File-type tagline conventions

  file_type_tagline["sh"]       = "# arch-tag: %s"
  file_type_tagline["c"]        = "/* arch-tag: %s\n   (do not change this comment) */"
  file_type_tagline["c++"]      = "// arch-tag: %s"
  file_type_tagline["pascal"]   = "(* arch-tag: %s\n   (do not change this comment) *)"
  file_type_tagline["html"]     = "<!-- arch-tag: %s\n     (do not change this comment) -->"
  file_type_tagline["lisp"]     = ";; arch-tag: %s"
  file_type_tagline["null"]     = "arch-tag: %s"
  file_type_tagline["roff"]     = ".\\\" arch-tag: %s"
  file_type_tagline["tex"]      = "%% arch-tag: %s"

  # For the following types, the most natural comment syntaxes run
  # afoul of the rule that arch-tag: must only be preceded by
  # punctuation, so various workarounds are used instead; hopefully
  # this problem will be fixed with the switch to arch-id:.

  #file_type_tagline["m4"]      = "dnl arch-tag: %s"
  file_type_tagline["m4"]       = "ifelse(dnl\tDo not change this comment\n   arch-tag: %s\n)dnl"
  #file_type_tagline["autoconf"] = "dnl arch-tag: %s"
  file_type_tagline["autoconf"] = "m4_if(dnl\tDo not change this comment\n   arch-tag: %s\n)dnl"
  file_type_tagline["automake"] = "## arch-tag: %s"

  #file_type_tagline["texi"]    = "@c arch-tag: %s"
  file_type_tagline["texi"]     = "@ignore\n   arch-tag: %s\n@end ignore"

  #file_type_tagline["bat"]     = "rem arch-tag: %s"
  file_type_tagline["bat"]      = "goto skipArchTag\n   arch-tag: %s\n:skipArchTag"

  # Should be a bit loose
  file_type_end_marker_re["lisp"] = ";; .* ends here"
  file_type_end_marker_re["sh"]   = "# .* ends here"
}

# Merge the rules in the project file {arch}/=tagline-rules (if it can
# be read) into the tagline_type, file_type_tagline and
# file_type_end_marker_re tables.  Only the first call does any work.
function init_tagline_rules(  type,level,kind,regexp,idx,parts,line)
{
  if (! _tagline_rules_initialized) {
    ## Initialization

    # Read project tagline rules file
    #
    # The syntax is:
    #
    #       KIND[.LEVEL]  REGEXP  TYPE
    #  or:  tagline       TYPE    FORMAT
    #  or:  end-marker    TYPE    REGEXP
    #  or:  # COMMENT
    #
    # where KIND is "name" to match whole filenames, "ext" to match file
    # extensions, "header" to match the first line of the file, and "dir" to
    # match directories (in that order of priority).  LEVEL is a priority,
    # where lower levels are searched first; default rules are level 9 or
    # 10, and user rules with no explicit priority are level 5.
    #
    # REGEXP is an egrep-style (`extended'\'') regular expression used for
    # matching (within the context of KIND), and a positive match means a
    # tagline of type TYPE is used.
    #
    # FORMAT should contain the actual tagline to add for TYPE, with a
    # single %s where the actual tag value should be substituted (any other
    # occurrences of % should be escaped by doubling them, e.g. %%).
    #
    # Both REGEXP and FORMAT may contain the following special backslash
    # escape sequences: \n \t \s \\  (\s means a space)
    # other occurrences of backslash are left unchanged.
    #
    while ((getline line < "{arch}/=tagline-rules") > 0)
      if (line !~ /^ *(#.*)?$/) {
        split (line, parts)
        if (parts[1] == "tagline") {
          # Everything after the TYPE field is the format
          sub (/^tagline[ \t]*[^ \t]*[ \t]*/, "", line)
          file_type_tagline[parts[2]] = _file_tagline_unesc(line)
        } else if (parts[1] == "end-marker") {
          # Everything after the TYPE field is the regexp.  It must go
          # into file_type_end_marker_re, the table the built-in
          # end-marker rules live in, or the rule is never looked at.
          sub (/^end-marker[ \t]*[^ \t]*[ \t]*/, "", line)
          file_type_end_marker_re[parts[2]] = _file_tagline_unesc(line)
        } else {
          level = 5
          kind = parts[1]
          regexp = _file_tagline_unesc(parts[2])
          type = parts[3]
          if (kind ~ /[.]/) {
            # KIND.LEVEL form
            split (kind, parts, /[.]/)
            kind = parts[1]
            level = parts[2] + 0
          }
          tagline_type[level, kind, regexp] = type
        }
      }
    close ("{arch}/=tagline-rules")

    _tagline_rules_initialized = 1
  }
}

# Build the lookup tables used by file_tagline_type from tagline_type:
# for every (LEVEL, KIND, TYPE) a single anchored regexp combining all
# the regexps of the matching rules, plus the set of levels and types
# in use.  Only the first call does any work.
function _file_tagline_init(  type,level,kind,regexp,idx,parts,line)
{
  if (! _file_tagline_initialized) {
    init_tagline_rules()

    for (idx in tagline_type) {
      split (idx, parts, SUBSEP)
      level = parts[1]
      kind = parts[2]
      regexp = parts[3]

      if (level > _file_tagline_max_level)
        _file_tagline_max_level = level
      _file_tagline_levels[level] = 1

      type = tagline_type[idx]
      if ((level, kind, type) in _file_tagline_type_re) {
        # Add another alternative to the regexp built so far
        if (kind == "ext")
          regexp = ".*\\." regexp
        regexp = _file_tagline_type_re[level, kind, type] "|" regexp
      } else if (kind == "ext")
        regexp = "^(.*\\." regexp
      else if (kind == "dir")
        regexp = "(^|/)(" regexp
      else
        regexp = "^(" regexp

      _file_tagline_type_re[level, kind, type] = regexp

      _file_tagline_types[type] = 1
    }

    # Close the group opened above and anchor at the end
    for (idx in _file_tagline_type_re)
      _file_tagline_type_re[idx] = _file_tagline_type_re[idx] ")$"

    _file_tagline_initialized = 1
  }
}

# Return STRING with the rule-file escape sequences \n \t \s \\ replaced
# by newline, tab, space and backslash respectively.
function _file_tagline_unesc(string)
{
  gsub (/\\\\/, "\\q", string)  # Change \\ to \q to avoid confusion below
  gsub (/\\n/, "\n", string)    # Do substitutions
  gsub (/\\t/, "\t", string)
  gsub (/\\s/, " ", string)
  gsub (/\\q/, "\\", string)    # Finally change \q into plain \
  return string
}

# Return a type whose combined regexp for LEVEL and KIND matches STRING,
# or 0 if there is none.
function _file_tagline_find_type(string, level, kind  ,type)
{
  for (type in _file_tagline_types)
    if ((level, kind, type) in _file_tagline_type_re)
      if (string ~ _file_tagline_type_re[level, kind, type])
        return type
  return 0
}

# Return a type whose "header" regexp for LEVEL matches the first line
# of FILE, or 0 if there is none.  The file is only read if LEVEL has
# header rules at all.
function _file_tagline_find_header_type(file, level  ,type,header)
{

  for (type in _file_tagline_types)
    if ((level, "header", type) in _file_tagline_type_re) {
      if (! header)
        header = file_first_line(file)
      if (! header)
        return 0
      if (header ~ _file_tagline_type_re[level, "header", type])
        return type
    }
  return 0
}

# Return the `tagline type'\'' of a file, which determines what commenting
# conventions to use for adding a tagline, or 0 if no tagline should be used
function file_tagline_type(file  ,base_name,dir,type,level)
{
  _file_tagline_init()

  if (file ~ /\//) {
    dir = file
    sub (/\/[^\/]*$/, "", dir)
    base_name = file
    sub (/.*\//, "", base_name)
  } else {
    dir = "."
    base_name = file
  }

  # Lower levels are tried first; within a level the order is
  # name, ext, header, dir.
  type = 0
  for (level = 0; level <= _file_tagline_max_level && !type; level++)
    if (level in _file_tagline_levels) {
      type = _file_tagline_find_type(base_name, level, "name")
      if (! type)
        type = _file_tagline_find_type(base_name, level, "ext")
      if (! type)
        type = _file_tagline_find_header_type(file, level)
      if (! type)
        type = _file_tagline_find_type(dir, level, "dir")
    }

  # The pseudo-type "explicit" means: do not use a tagline
  if (type == "explicit")
    type = 0

  return type
}

'
# (---- end of TAGLINE_AWK_FUNS defined from tagline-funs.awk ----)

# List tagline rules on stdout, in the format of {arch}/=tagline-rules.
# Optional single arg is the kind of rule to list: one of `name', `ext',
# `header' or `dir' (filename mapping rules of that kind), `tagline'
# or `end-marker'.  With no argument, everything is listed.
list_tagline_rules()
{
  # The kind is handed over with -v rather than pasted into the program
  # text, so a quote character in it cannot break the awk program.
  $AWK -v limit="$1" '
    '"$TLA_AWK_FUNS"'
    '"$TAGLINE_AWK_FUNS"'

    # Return STR with backslash, newline and tab (and, if DO_SPACES is
    # true, space) replaced by the escape sequences of the rules file.
    function esc(str, do_spaces)
    {
      gsub (/\\/, "\\\\", str)
      gsub (/\n/, "\\n", str)
      gsub (/\t/, "\\t", str)
      if (do_spaces)
        gsub (/ /, "\\s", str)
      return str
    }

    # Return the sort rank of the split rule key PARTS: the level
    # dominates, then the kind in the order name, ext, header, dir.
    function tagline_key_prio(parts  ,prio,kind)
    {
      prio = parts[1] * 16
      kind = parts[2]

      if (kind == "dir")
        return prio + 3
      else if (kind == "header")
        return prio + 2
      else if (kind == "ext")
        return prio + 1
      else
        return prio
    }

    # Return true if rule key KEY1 sorts before KEY2: by rank first,
    # then by regexp.
    function tagline_keys_lessp(key1, key2  ,parts1,parts2,prio1,prio2)
    {
      split (key1, parts1, SUBSEP)
      split (key2, parts2, SUBSEP)

      prio1 = tagline_key_prio(parts1)
      prio2 = tagline_key_prio(parts2)

      return prio1 < prio2 || (prio1 == prio2 && parts1[3] < parts2[3])
    }

    # Sort KEYS[0..LEN-1] in place using tagline_keys_lessp.
    function sort_tagline_keys(keys, len,  i,j,tmp)
    {
      for (i = 0; i < len; i++)
        for (j = i + 1; j < len; j++) {
          if (! tagline_keys_lessp(keys[i], keys[j]))
            {
              tmp = keys[i]
              keys[i] = keys[j]
              keys[j] = tmp
            }
        }
    }

    # Print every entry of ARRAY, ordered by key, using printf format
    # FMT with the key and the escaped value as arguments.
    function print_sorted_mapping(array, fmt  ,num_els,key,keys,i,j,tmp)
    {
      num_els = 0
      for (key in array)
        keys[num_els++] = key

      for (i = 0; i < num_els; i++)
        for (j = i + 1; j < num_els; j++) {
          if (keys[i] > keys[j])
            {
              tmp = keys[i]
              keys[i] = keys[j]
              keys[j] = tmp
            }
        }

      for (i = 0; i < num_els; i++)
        printf (fmt, keys[i], esc(array[keys[i]], 0))
    }

    BEGIN {
      # Force a string, so that only the empty value means "no limit"
      limit = limit ""
      if (limit == "")
        limit = 0

      init_tagline_rules()

      if (!limit || limit ~ /^(name|ext|header|dir)$/) {
        print "# Filename to file-type mapping rules (in order of priority)"
        printf ("# %s\t%-20s %s\n", "KIND", "REGEXP", "TYPE")

        num_tagline_type_keys = 0
        for (key in tagline_type)
          tagline_type_keys[num_tagline_type_keys++] = key
        sort_tagline_keys(tagline_type_keys, num_tagline_type_keys)

        for (i = 0; i < num_tagline_type_keys; i++) {
          key = tagline_type_keys[i]

          split (key, parts, SUBSEP)

          level = parts[1] + 0
          kind = parts[2]
          regexp = parts[3]

          # Level 10 rules are shown without a level suffix
          if (!limit || limit == kind)
            if (level == 10)
              printf ("%s\t%-20s %s\n",
                      kind, esc(regexp, 1),
                      tagline_type[level, kind, regexp]);
            else
              printf ("%s.%d\t%-20s %s\n",
                      kind, level, esc(regexp, 1),
                      tagline_type[level, kind, regexp]);
        }
      }

      if (!limit || limit == "tagline") {
        print ""
        print "# File-type tagline-syntax rules"
        printf ("#       %-8s %s\n", "TYPE", "TAGLINE FORMAT")
        print_sorted_mapping(file_type_tagline, "tagline %-8s %s\n")
      }

      if (!limit || limit == "end-marker") {
        print ""
        print "# File-type end-marker rules"
        printf ("#          %-8s %s\n", "TYPE", "END-MARKER REGEXP")
        print_sorted_mapping(file_type_end_marker_re, "end-marker %-8s %s\n")
      }
    }
  '
}

# State set by the command-line options:
#   DRY_RUN          1 if -n/--dry-run was given, 0 otherwise
#   NUM_HINT_TREES   number of --id-hint-tree options seen so far
#   HINT_TREE_INITS  AWK statements initialising hint_tree[N], one per
#                    --id-hint-tree option, in the order given
DRY_RUN=0
NUM_HINT_TREES=0
HINT_TREE_INITS=''

# Record the id-hint tree rooted at $1: append an AWK assignment of it
# to HINT_TREE_INITS and bump NUM_HINT_TREES.
add_hint_tree()
{
  local escaped
  # The path ends up inside a double-quoted AWK string literal, so both
  # backslashes and double quotes must be escaped (backslashes first).
  # printf is used because echo rewrites backslashes in some shells.
  escaped=$(printf '%s\n' "$1" | $SED -e 's@\\@\\\\@g' -e 's@"@\\"@g')
  HINT_TREE_INITS="$HINT_TREE_INITS hint_tree[$NUM_HINT_TREES] = \"$escaped\";"
  NUM_HINT_TREES=$((NUM_HINT_TREES + 1))
}

# Parse command-line options.  Each pass looks at "$1" and consumes the
# option (and its value, if any); the loop ends at the first argument
# that is not an option.  Informational options print and exit 0; an
# unrecognized option or a missing option value exits with status 10.
while :; do
  case "$1" in
    --tagline-rules|-T)
      shift
      list_tagline_rules "$@"
      exit 0;;
    -n|--dry-run)
      DRY_RUN=1; shift;;
    --id-hint-tree)
      # The value is the next argument; without this check a missing
      # value made `shift 2' fail.
      if test "$#" -lt 2; then
	echo 1>&2 "$me: option \`$1' requires an argument"
	cmd_line_err
	exit 10
      fi
      add_hint_tree "$2"; shift 2;;
    --id-hint-tree=*)
      add_hint_tree "`long_opt_val "$1"`"; shift;;
    --help|-h|-H)
      help; exit 0;;
    --help-tagline-rules)
      cat <<'EOF'
When a new file is added to a source tree which is using the "tagline"
id-tagging method, you usually want a tagline to be added to it (instead of
an explicit id); however as the tagline is actually part of the source
file, it must use a syntax which does not cause problems for the file's
other uses. For instance, to a shell-script or a Makefile, you'd add a line
that looks something like:

   # arch-tag: 8d4a1294-6b5e-4dfd-bbd0-7efc22483b05

tla-update-ids has a fair number of built-in rules for determining what
tagline syntax to use based on the name of the file; these should catch
common types of files.

The built-in rules may also be extended on a per-project basis by using an
{arch}/=tagline-rules file; that file consists of lines, where each line
should have one of the following forms:

   name REGEXP TYPE     Files with names matching REGEXP have file-type TYPE
			(the actual tagline syntax is determined by the
			file-type).  REGEXP is an egrep-style ("extended")
			regular expression.

   ext REGEXP TYPE      Files with extensions matching REGEXP have file-type
			TYPE.

   header REGEXP TYPE   Files whose first-line matches REGEXP have file-type
			TYPE.

   dir REGEXP TYPE      Files in directories matching REGEXP have file-type 
			TYPE.

   name.LEVEL REGEXP TYPE
   ext.LEVEL REGEXP TYPE
   header.LEVEL REGEXP TYPE
   dir.LEVEL REGEXP TYPE
			These are like the corresponding rule without .LEVEL,
			but also specify an explicit priority, LEVEL.  Lower
			levels are search first; default rules are level 9 or
			10, and user rules with no explicit priority are level
			5.

   tagline TYPE FORMAT  Specifies the tagline syntax for the file-type TYPE to
			be FORMAT.  FORMAT should contain the actual tagline
			text for TYPE, with a single %s where the actual tag
			value should be substituted (any other occurrences of
			% should be escaped by doubling them, e.g %%).

   end-marker TYPE REGEXP   For the file-type TYPE, identifies what
			end-of-file lines the tagline should be placed
			before.  Note that most file-types don't specify an
			end-marker.

   # comment 	A comment.

If more than one rule matches a given file, the precedences is determined as
follows:

   1.  Rules with a lower level argument take precedence over those with a
       higher level argument. As built-in rules have level 9 or 10, and user
       rules by default have level 5, this means that user rules should
       usually override built-in rules.

   2.  Within a level, the precedence is the same as the order in the above
       table: name, ext, header, dir.

Both REGEXP and FORMAT may contain the following special backslash escape
sequences: \n \t \s \\ (\s means a space); other occurrences of backslash are
left unchanged.

An example is the {arch}/=tagline-rules file for the tla-tools package itself,
which adds a rule to say that files with the extension .shpp use a "sh"
(shell) tagline syntax:

   ext     shpp    sh

The "sh" file-type results in taglines that look like:

   # arch-tag: ...

To see the list of rules used by tla-update-ids to determine tagline syntax,
use the `--tagline-rules' option; the output is in the same format as
{arch}/=tagline-rules.
EOF
      exit 0;;
    --version|-V)
      version; exit 0;;
    -[!-]?*)
      # Split concatenated single-letter options apart: "-xyz" becomes
      # "-x" "-yz", and the next pass looks at "-x".  Done with parameter
      # expansion so the argument is never word-split or glob-expanded.
      FIRST="$1"; shift
      REST="${FIRST#-?}"
      set -- "${FIRST%"$REST"}" "-$REST" "$@"
      ;;
    -*)
      unrec_opt "$1"; exit 10;;
    *)
      break;;
  esac
done

# No non-option arguments are accepted.
test "$#" = 0 || { cmd_line_err; exit 10; }

# Root of the project tree containing the current directory.
TREE_ROOT=`$TLA tree-root 2>/dev/null` || { echo 1>&2 "$me: Not in an arch project tree"; exit 11; }

ID_TAGGING_METHOD=`$TLA id-tagging-method`

# There's no point if the tree is not using tags at all
test "$ID_TAGGING_METHOD" = names && exit 0

# We use `tla tree-lint' to list files to act on, and tree-lint always
# operates relative to the tree-root.  Everything after this point uses
# tree-root relative names, so do not carry on from the wrong directory.
cd "$TREE_ROOT" || { echo 1>&2 "$me: Cannot change directory to $TREE_ROOT"; exit 11; }

{
  $TLA tree-lint $TLA_UNESCAPED_OPT --untagged-files | $SED 's@^@no_tag:@'
  $TLA tree-lint $TLA_UNESCAPED_OPT --missing-files  | $SED 's@^@no_file:@'
} | $AWK '
'"$TLA_AWK_FUNS"'
'"$TAGLINE_AWK_FUNS"'

BEGIN {
  # program name
  me = "'"$me"'"

  # Renaming heuristic thresholds
  max_closeness = 100
  rename_closeness_threshold = 10

  dry_run = '$DRY_RUN'

  # This script works best with taglines, but try to handle explicit
  # tagging too.
  id_tagging_method = "'"$ID_TAGGING_METHOD"'"

  num_hint_trees = '"$NUM_HINT_TREES"'
  '"$HINT_TREE_INITS"'
}


# Append a tagline for file-type TYPE with the id-tag ID to FILE.
# Returns 1 if successful, and 0 otherwise.
#
# If the last line of FILE matches the end-marker regexp registered for
# TYPE in file_type_end_marker_re, the tagline is inserted before that
# line instead of after it.  This is done by building FILE ".new" and then
# moving it over FILE; the temp file is removed again if any step fails,
# so a failed attempt does not leave FILE ".new" behind.
#
# NOTE(review): FILE ".new" is overwritten if it already exists.
function append_tagline(file, type, id  ,tmp_file,ok,text,last_line)
{
  text = sprintf (file_type_tagline[type], id)

  last_line = file_last_line(file)

  if (last_line								\
      && type in file_type_end_marker_re				\
      && last_line ~ file_type_end_marker_re[type])
  {
    # The file has an "end-marker" comment, and we want to put the
    # tagline before it, so we cannot simply append to the file.
    tmp_file = file ".new"

    # Make a temp file containing everything but the last line ...
    ok = run_cmd("sed >", tmp_file, "$d", file)

    # ... append the tagline to the temp file instead ...
    if (ok)
      ok = append_text(tmp_file, text, file_last_line(tmp_file))

    # ... add the last line (the end-marker) back ...
    if (ok)
      ok = run_cmd("sed -n >>", tmp_file, "$p", file)

    # ... and move the result over the real file.
    if (ok)
      ok = run_cmd("mv", tmp_file, file)

    if (! ok)
      # Clean up after any failed step.  The output redirection may have
      # created the temp file even if the very first command failed, and
      # -f keeps rm quiet in case the file does not exist.
      run_cmd("rm -f", tmp_file)
  } else
    ok = append_text(file, text, last_line)

  return ok
}

# Return a measure of how much the files OLD and NEW differ: the number of
# changed lines reported by diff, as a percentage of the combined line
# count of both files.  Smaller values mean the files are more alike;
# max_closeness is returned when either file has no lines.
function compute_file_closeness(old, new  ,n_old,n_new,n_changed,diff_cmd)
{
  n_old = file_num_lines(old)
  n_new = file_num_lines(new)

  if (n_old != 0 && n_new != 0) {
    # Binary files are pushed through diff as text too; that should be
    # reasonably safe, as the output is only ever counted by grep -c.
    diff_cmd = make_cmd("diff --text", old, new) " | grep -c \"^[<>]\""
    n_changed = run_cmd_first_line(diff_cmd)

    # Changed lines as a percentage of all lines in both files
    return (n_changed * 100) / (n_old + n_new)
  }

  # An empty file is never treated as a match for anything
  return max_closeness
}

# Look in the list of "existing id hint" trees (hint_tree[0] up to
# hint_tree[num_hint_trees - 1], searched in that order) for FILE, and if
# found, return the id-tag used; otherwise return a false value.
#
# Auxiliary info about a match is returned in the array ID_INFO:
#   "id-tagging-method"  the method found ("explicit" or "tagline")
#   "tree-root"          the root of the hint tree the id was found in
#   "file"               the name of the matching file in that tree
# ID_INFO is not modified when nothing is found.
function find_existing_id(file, id_info  ,dir,dir_id,i,hint_root,hint_dir,hint_file,meth_id,id)
{
  if (num_hint_trees == 0)
    return 0

  if (file ~ /\//) {
    # FILE is in a subdirectory; split it into DIR and a bare file name
    dir = file
    sub (/\/[^\/]*$/, "", dir)
    sub (/^.*\//, "", file)

    dir_id = file_id(dir)
  } else {
    dir = ""
    dir_id = 0
  }

  for (i = 0; i < num_hint_trees && !id; i++) {
    hint_root = hint_tree[i]

    # Start afresh for every tree, so that a candidate from the previous
    # tree is never looked at again.
    hint_file = ""

    if (dir_id) {
      # Look for DIR in HINT_ROOT; first try the exact same name, as it
      # is likely the dir has the same name.
      hint_dir = hint_root "/" dir

      if (file_id(hint_dir) != dir_id) {
        # Whoops, no.  DIR must have been renamed in HINT_ROOT, we have to
        # look further.
        hint_dir = id_file(dir_id, hint_root, 1)
      }

      if (hint_dir)
        hint_file = hint_dir "/" file
    } else if (dir != "")
      # DIR has no id to match on, so all we can do is look under the same
      # directory name in HINT_ROOT.  (FILE has been stripped down to its
      # last component by now, so looking in the root of HINT_ROOT would
      # pick up an unrelated file of the same name.)
      hint_file = hint_root "/" dir "/" file
    else
      # FILE is in the tree root, just look there in HINT_ROOT
      hint_file = hint_root "/" file

    if (hint_file) {
      # METH_ID is the id with a one-character method code and "_" in
      # front; "x_" marks an explicit id, and anything else is treated as
      # a tagline.
      meth_id = file_meth_id(hint_file)
      if (meth_id) {
        id = meth_id
        sub(/^._/, "", id)
        id_info["id-tagging-method"] = (meth_id ~ /^x_/ ? "explicit" : "tagline")
        id_info["tree-root"] = hint_root
        id_info["file"] = hint_file
      }
    }
  }

  return id
}

# Lines marked "no_tag:" name files that have no id yet; collect them (with
# the marker stripped) in dests[0 .. num_dests - 1].
$0 ~ /^no_tag:/ {
  $0 = substr($0, length("no_tag:") + 1)
  dests[num_dests++] = $0
  next
}
# Lines marked "no_file:" name explicit ids whose file is missing; collect
# the corresponding file names in sources[0 .. num_sources - 1].
$0 ~ /^no_file:/ {
  $0 = substr($0, length("no_file:") + 1)
  sources[num_sources++] = file_from_explicit_id_file($0)
  next
}

# Print an error message on stderr, prefixed with the program name.
# MSG is a printf format, and ARG1 .. ARG3 are its (optional) arguments.
function err_msg(msg, arg1, arg2, arg3)
{
  # The program name is passed as an argument rather than being made part
  # of the format, so that a "%" in it cannot be taken for a conversion.
  printf ("%s: " msg "\n", me, arg1, arg2, arg3) | "cat 1>&2"

  # Close the pipe so the message is written out right away, instead of
  # sitting in a buffer until the program ends or other commands have
  # already produced their own output.
  close("cat 1>&2")
}
# Report an error via err_msg (MSG is a printf format, ARG1 .. ARG3 its
# optional arguments), then exit with status EXIT_CODE.
function fatal_err(exit_code, msg, arg1, arg2, arg3)
{
  err_msg(msg, arg1, arg2, arg3)
  exit exit_code
}

# Main processing, run once all input has been collected in sources[] and
# dests[]:
#   1. For each missing explicit id, look for an untagged file that is
#      close enough to its pristine copy; if there is one, move the id to
#      it (a rename), otherwise delete the id.
#   2. Give an id to every untagged file that is left, preferring an id
#      found in a hint tree over a freshly generated one.
#   3. If explicit ids were added to directories, run this script again to
#      handle their contents.
#
# Exit status: 2 if some operation failed, 1 if something was changed (or,
# with dry_run set, would have been changed), and 0 otherwise.
END {
  ok = 1
  added_dirs = 0

  # Now try to match up adds and deletes of explicitly tagged files if possible
  for (i = 0; i < num_sources && ok; i++) {
    old = sources[i]
    # presumably the pristine (unmodified) copy of OLD -- TODO confirm
    # against the definition of pristine_file
    old_pristine = pristine_file(old)

    # Start from the "no match" value; it is only lowered if some
    # untagged file compares closer than that.
    best_closeness = max_closeness
    if (file_exists(old_pristine)) {
      # We do not really care about deletes, except for files that were
      # explicitly tagged -- in that case, there is the possibility that the
      # file was actually renamed/moved.

      # Find the untagged file with the smallest closeness value
      for (j = 0; j < num_dests; j++) {
	new = dests[j];

	closeness = compute_file_closeness(old_pristine, new)
	if (closeness < best_closeness) {
	  best_closeness = closeness
	  best_target_index = j
	}
      }
    }

    if (best_closeness < rename_closeness_threshold) {
      # Close enough to be considered the same file under a new name
      new = dests[best_target_index]

      if (dry_run)
	print "* would rename explicit id: " old " => " new
      else {
	print "* renaming explicit id: " old " => " new
	ok = run_cmd("$TLA move-id", old, new)
      }

      # Remove the target from further consideration
      # (the last entry is moved into its slot, so order is not kept)
      dests[best_target_index] = dests[--num_dests]
    } else {
      # must be a delete instead
      if (dry_run)
	print "* would remove explicit id: " old
      else {
	print "* removing explicit id: " old
	ok = run_cmd("$TLA delete-id", old)
      }	

      # Remember the directory that held the id, so that we can try to
      # remove it once everything else is done.
      removed_xid_dirs[file_explicit_id_dir(old)] = 1
    }

    something_changed = 1
  }

  # Add ids to any files that were not used in renaming
  for (i = 0; i < num_dests && ok; i++) {
    new = dests[i]

    is_dir = file_is_dir(new)

    # Look for an existing copy of this file in another tree, and if found
    # use its id
    id = find_existing_id(new, existing_id_info)
    if (id) {
      # Existing id found

      # Check to see whether using ID results in a duplicate id
      dup_file = id_file(id, ".", is_dir)

      if (dup_file) {
	err_msg("%s: existing id-tag found in tree %s is a duplicate of %s; ignored", new, existing_id_info["tree-root"], dup_file)
	# Clearing ID makes the code below generate a new id instead
	id = 0
      } else if (existing_id_info["id-tagging-method"] == "explicit") {
	# current file uses an explicit tag, so use one here too
	# (a TYPE of 0 selects an explicit id rather than a tagline below)
	type = 0
      } else {
	# must be a tagline
	if (id_tagging_method == "tagline" || id_tagging_method == "implicit") {
	  type = file_tagline_type(new)
	  if (! type) {
	    err_msg("%s: existing id-tag found in tree %s uses a tagline, but no tagline-type can be deduced for this file; no id-tag added!", new, existing_id_info["tree-root"])
	    # This is treated as a failure, and ends the loop
	    ok = 0
	  }
	} else {
	  err_msg("%s: existing id-tag found in tree %s uses a tagline, which is not compatible with this tree'\''s id-tagging-method (%s); no id-tag added!", new, existing_id_info["tree-root"], id_tagging_method)
	  ok =0
	}
      }

      # Text added to the progress message to show where the id came from
      if (id && ok)
	existing_id_note = " (from " existing_id_info["file"] ")"
    }

    if (! id) {
      # No existing id, make a new one

      id = unique_id()
      if (! id)
	fatal_err(12, "cannot generate unique-id for tagline, aborting...")

      # New ids are only added as taglines to non-directories in trees
      # using the "tagline" method; everything else gets an explicit id.
      if (!is_dir && id_tagging_method == "tagline")
	type = file_tagline_type(new)
      else
	type = 0

      existing_id_note = ""
    }

    if (ok) {
      if (type) {
	# Add the id as a tagline of syntax TYPE
	if (dry_run)
	  print "* would add `" type "'\'' tagline" existing_id_note ": " new
	else {
	  print "* adding `" type "'\'' tagline" existing_id_note ": " new
	  ok = append_tagline(new, type, id)
	}
      } else {
	# Add the id as an explicit id
	if (dry_run) {
	  print "* would add explicit id" existing_id_note ": " new
	  # Nothing was really added, so the contents of the new directory
	  # are not examined; say so, but only once.
	  if (is_dir && !printed_added_dirs_warning) {
	    print "* WARNING: --dry-run does not recurse into new directories"
	    printed_added_dirs_warning = 1
	  }
	} else {
	  print "* adding explicit id" existing_id_note ": " new
	  ok = run_cmd("$TLA add-id --id", id, new)
	  # A newly added directory needs another pass (see below)
	  if (ok && is_dir)
	    added_dirs = 1
	}
      }

      something_changed = 1
    }
  }

  if (ok && added_dirs) {
    # Recursively invoke ourselves to deal with the new subdirectories.
    # We use "system" instead of run_cmd so we can look at the
    # command exit-status.  An exit-status of 0 or 1 is OK,
    # anything else means there was an error.
    # NOTE(review): no command-line options are passed to the recursive
    # invocation -- confirm that this is intended.
    sub_command_status = system(make_cmd("'"$0"'"))
    ok = (sub_command_status == 0 || sub_command_status == 1)
  }

  if (! dry_run)
    # Try to remove explicit-id directories too
    # (error output from rmdir is discarded)
    for (xid_dir in removed_xid_dirs)
      run_cmd("rmdir 2>/dev/null", xid_dir)

  # Report the overall result through the exit status
  if (! ok)
    exit (2)
  else if (something_changed)
    exit (1)
  else
    exit (0)
}
'

