#!/bin/bash
#
# Usage: ./convert public/data/SomeDir/*.rtf
#
# Requires Apache Tika, unrtf and par. (Tika sucks at RTF->text.)
# On Mac OS X, `brew install tika unrtf par`

# Abort as soon as any simple command fails.
set -e
# Make pattern matching in `case` and `[[ ]]` case-insensitive (this option does
# not affect the `[ ]` / `test` builtin).
shopt -s nocasematch

# Scratch file shared by all conversions; removed when the script exits.
tempfile=$(mktemp -t convertXXXXXX)
# Single quotes defer expansion until the trap fires, and the inner double
# quotes keep a temp path containing whitespace intact.
trap 'rm -f -- "$tempfile"' EXIT

# Directory containing this script; the `tika` and `fixup` helpers are run
# from there.
script_dir=$(dirname "$0")

# Print the destination path for source file $1: the same path with the final
# extension of the file name replaced by ".txt". Only the last path component
# is inspected, so dots in directory names (or a leading "./") are left alone.
# A file name without any extension simply gets ".txt" appended.
dest_path_for() {
  local src_path=$1
  local file_name=${src_path##*/}
  if [[ "$file_name" == *.* ]]; then
    printf '%s\n' "${src_path%.*}.txt"
  else
    printf '%s\n' "${src_path}.txt"
  fi
}

# Succeed when the file name of $1 ends in ".rtf", in any letter case. The
# case-insensitivity is spelled out in the pattern so it does not depend on
# the shell's nocasematch setting.
is_rtf() {
  [[ "${1##*/}" == *.[Rr][Tt][Ff] ]]
}

# Convert every file named on the command line to a sibling .txt file.
for src in "$@" ; do

  # Fall back to looking under /data when the path as given is not a file.
  if [ ! -f "$src" ]; then
    echo "$src doesn't exist, trying /data/$src"
    src="/data/$src"
  fi
  if [ ! -e "$src" ]; then
    echo "Can't find file: $src"
    continue
  fi

  dest=$(dest_path_for "$src")
  if [ -e "$dest" ]; then
    echo "Skipping already converted file: $dest"
    continue
  fi

  echo "Creating $dest..."
  if is_rtf "$src"; then
    # RTF goes through unrtf; iconv -c drops characters that cannot be
    # converted to UTF-8, and par reflows the paragraphs.
    unrtf --text "$src" | iconv -t utf-8 -c | par > "$dest"
    echo "OK"
  else
    # Everything else goes through Tika: a metadata header fenced by '---'
    # lines, a blank line, then the extracted text.
    (
      echo '---'
      "$script_dir/tika" -m "$src"
      echo '---'
      echo
      # Tika parses paragraphs properly, but when converting to text the
      # paragraphs are lost. Insert an empty line after every line before par
      # runs so the paragraphs are preserved. `sed G` does this portably; a
      # "\n" in an s/// replacement is not understood by BSD/macOS sed.
      "$script_dir/tika" -t "$src" | sed G | par
    ) >"$tempfile"
    "$script_dir/fixup" "$tempfile" >"$dest"
    echo "OK"
    # Show the document title from the metadata header, if there is one.
    # printf is used so backslashes in the title are printed literally.
    printf '\t%s\n' "$( grep -Ei '^dc:title' "$dest" | cut -d\  -f2- )"
  fi

done
