#!/usr/bin/env bash

set -euxo pipefail

_zsh_html_url="http://zsh.sourceforge.net/Doc/zsh_html.tar.gz"
_zsh_doc_tmp_dir="${HOME}/zsh_doc_tmp"
_zsh_html_src_dir="${_zsh_doc_tmp_dir}/zsh_html"
_zsh_md_src_dir="${_zsh_doc_tmp_dir}/zsh_md"
_mdbook_src_dir="${_zsh_doc_tmp_dir}/mdbook_src"

# Ensure zsh_doc_tmp directories exist
for dir in "${_zsh_html_src_dir}" "${_zsh_md_src_dir}" "${_mdbook_src_dir}"; do
  if ! [ -d "$dir" ]; then
    mkdir -p "$dir"
  fi
done

# Download and extract zsh_html.tar.gz to zsh_doc_tmp_dir
wget "${_zsh_html_url}" -O "${_zsh_doc_tmp_dir}/zsh_html.tar.gz"
tar xzf "${_zsh_doc_tmp_dir}/zsh_html.tar.gz" -C "${_zsh_doc_tmp_dir}"

# For some reason, extra files are added to the archive that
# have no content besides aliases to pages that do.
# They are all less than 4k, and they are just clutter for mdbook,
# so we remove them here.
find "${_zsh_html_src_dir}" -name "*.html" -type 'f' -size -4k -delete

# Remove html noise
for file in "${_zsh_html_src_dir}"/*.html; do
  sed -i '/table/d' "$file"
  sed -i '/span/d' "$file"
  sed -i '/valign/d' "$file"
  sed -i '/\[\]{#/d' "$file"
done

# Rename file extensions from html to md, preserving the original file's name
for file in "${_zsh_html_src_dir}"/*.html; do 
  mv -- "$file" "${_zsh_md_src_dir}/$(basename -- "$file" .html).md"
done

# Convert html to md with pandoc
for file in "${_zsh_md_src_dir}"/*.md; do
  pandoc "$file" -f html -t gfm -o "$file";
done

# Move md files to mdbook_src_dir
for file in "${_zsh_md_src_dir}"/*.md; do
  mv "$file" "${_mdbook_src_dir}"/
done

# Generate TOC with doctoc
doctoc "${_mdbook_src_dir}"/*.md

# Generate SUMMARY.md from zsh_toc.html
python3 "${PWD}/generate_summary.py" > "${_mdbook_src_dir}/SUMMARY.md"