#!/bin/bash
# An incomplete script, more like notes really, for grabbing a
# copy of wiki.bash-hackers.org from archive.org

# This works by targeting pages that have been captured by the Wayback Machine
# and that specifically have '?do=edit' on the end of their URL.
# These pages present the original Dokuwiki Markup source of the respective page.
# So with a little massaging, we should be able to extract said Dokuwiki Markup.

# Ensure we have our required commands, otherwise fail early
cmd_err=0
for cmd in grep sed tr sort uniq mkdir curl jq pandoc; do
  if ! command -v "${cmd}" >/dev/null 2>&1; then
    printf -- '%s\n' "This script requires ${cmd} but it was not found in PATH." >&2
    (( cmd_err++ ))
  fi
done
(( cmd_err > 0 )) && exit 1

# Where are we playing on the local file system?
basedir="${HOME}/git/wiki.bash-hackers.org"

# Prepend a string e.g.
# cmd: prepend foo bar
# out: foobar
# Also has a delimiter option e.g.
# cmd: prepend -d ';' foo bar
# out: foo;bar
prepend() {
  local _prepend_delimiter
  case "${1}" in
    (-d|--delimiter)
      _prepend_delimiter="${2}"
      shift 2
    ;;
  esac
  printf -- '%s\n' "${1}${_prepend_delimiter:-}${2}"
}

# Call archive.org's 'available' API to see if a site is available
# This will either return the URL of the most recent snapshot, or null
# e.g.
# cmd: check_wayback_availability https://wiki.bash-hackers.org/howto/mutex?do=edit
# out: http://web.archive.org/web/20220615023742/https://wiki.bash-hackers.org/howto/mutex?do=edit
# vs
# cmd: check_wayback_availability https://contoso.com/pantsmcgee
# out: null
check_wayback_availability() {
  local remote_target
  remote_target="https://archive.org/wayback/available?url=${1:?No target specified}"
  curl -s -X GET "${remote_target}" | jq -r '.archived_snapshots.closest.url'
}

# Download the given target into a local file structure e.g.
# http://web.archive.org/web/20220706170849/https://wiki.bash-hackers.org/scripting/bashbehaviour?do=edit
# Will download into: "${basedir}/scripting/bashbehaviour"
get_wayback_target() {
  local remote_target target_path target_dirname
  remote_target="${1:?No target specified}"

  # Keep only the path after 'bash-hackers.org' and strip the trailing '?do=edit' e.g.
  # http://web.archive.org/web/20220615023742/https://wiki.bash-hackers.org/howto/mutex?do=edit -> /howto/mutex
  target_path="$(sed -n 's/.*bash-hackers.org//p' <<< "${remote_target}" | sed -e 's/?do=edit//')"

  # If we already have it, bounce out
  if [[ -f "./${target_path}.markup" ]]; then
    printf -- '%s\n' "./${target_path}.markup appears to already exist. Remove it and re-run this script to force a fresh download."
    return 0
  fi

  # Get the dirname e.g. /howto/mutex -> /howto
  target_dirname="$(dirname "${target_path}")"

  # Create the path, ensuring that we strip the leading slash just-in-case e.g. /howto -> howto
  mkdir -p "${basedir:?FATAL - basedir unset}/${target_dirname/\//}"

  # Download the remote target to the local path
  printf -- '%s\n' "Retrieving into ./${target_path}.markup"
  curl -s -X GET "${remote_target}" | extract_markup - > "./${target_path}.markup"
}

# We want to pull internal-wiki links out of the markup fragments. These look like
# [[path:to:file]]
# But can be given friendly names like
# [[path:to:file | friendly name]]
# The friendly name might have leading spaces around the pipe char or not
# We start by grep'ing for '[['
# Then we filter out external links e.g. '[[http://contoso.com]]'
# Then we grep out just the link substrings
# Then we filter out friendly names (this could be brutally tidied up, it's late and I'm lazy right now)
# Then we append '?do=edit'
# Then we filter out unwanted garbage one last time
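# As a worked example of that pipeline (the input line is made up, not from the wiki),
# a markup line like:
#   See [[howto:mutex | Lock your script]] for details.
# should fall out the other end as:
#   /howto/mutex?do=edit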
scrape_targets() {
  local source_file
  source_file="${1:?No target specified}"
  for source_file in "${@}"; do
    grep "\[\[" "${source_file}" |
      grep -v "\[\[http" |
      grep -o "\[\[.*\]\]" |
      sed -e 's/ | .*\]\]/]]/g' -e 's/| .*\]\]/]]/g' -e 's/|.*\]\]/]]/g' -e 's/\[\[/\//g' -e 's/\]\]/?do=edit/g' |
      tr ':' '/' |
      grep -Ev '==|!=|\$|#|/ |nowiki|ftp'
  done
}

# Because of the structure of the downloaded files,
# we should be able to reliably extract our target Dokuwiki Markup.
# First, remove everything between the first line and 'name="sectok"'
# Next, remove everything from '</textarea>' to the end of the file
# This should remove everything above and below our desired Dokuwiki Markup
# We also take the opportunity to convert some HTML entities back to real chars
# (this is a best-effort set of entities; add more if pages need them)
extract_markup() {
  sed -e '1,/name="sectok"/d' \
    -e '/<\/textarea>/,$d' \
    -e 's/&gt;/>/g' \
    -e 's/&lt;/</g' \
    -e 's/&quot;/"/g' \
    -e 's/&amp;/\&/g' "${1:--}"
}

# Everything from here on down works relative to basedir
(
  cd "${basedir:?FATAL - basedir unset}" || exit 1

  # Seed the run: grab the wiki's start page and scrape it for internal links,
  # then turn those into full wiki URLs
  # (assumption: the start page is as good an entry point as any)
  get_wayback_target "$(check_wayback_availability 'https://wiki.bash-hackers.org/start?do=edit')"
  while read -r; do
    prepend "https://wiki.bash-hackers.org" "${REPLY}"
  done < <(scrape_targets ./start.markup | sort | uniq) > raw_targets

  # For each scraped target, validate that they're available from archive.org
  # and in doing so, generate a list of the urls for the latest captures of each
  # (pages with no capture come back as 'null', so we drop those)
  while read -r; do
    check_wayback_availability "${REPLY}"
  done < raw_targets | grep -v '^null$' > waybacktargets

  # Work through each of the above urls
  while read -r; do
    get_wayback_target "${REPLY}"
  done < waybacktargets

  # Now we parse through the .markup files and try to generate a fresh list of raw_targets
  # With globstar, this would be
  # scrape_targets **/*.markup | sort | uniq
  while read -r; do
    scrape_targets "${REPLY}"
  done < <(find . -name "*.markup") | sort | uniq > fresh_targets

  while read -r; do
    prepend "https://wiki.bash-hackers.org" "${REPLY}"
  done < fresh_targets > raw_targets

  # And as before, we generate a list of targets and retrieve them
  while read -r; do
    check_wayback_availability "${REPLY}"
  done < raw_targets | grep -v '^null$' > waybacktargets

  while read -r; do
    get_wayback_target "${REPLY}"
  done < waybacktargets

  # Next, we convert from dokuwiki markup to github markdown
  while read -r; do
    pandoc --from dokuwiki --to gfm --toc --no-highlight "${REPLY}" > "${REPLY/.markup/}.md"
  done < <(find . -name "*.markup")
)

# Ugh, screw the constant while read loops. Let's use globstar from here.
cd "${basedir}" || exit 1
shopt -s globstar

# Replace the escaped DokuWiki note tags that survive the pandoc conversion with
# emoji'd table headers. The tag-to-emoji mapping below is a best guess; adjust to taste.
# Replace \<note\>
sed -i -e 's/\\<note\\>/\| :shell: /' ./**/*.md
# Replace \<note classic\>
sed -i -e 's/\\<note classic\\>/\| :loudspeaker: /' ./**/*.md
# Replace \<note tip\>
sed -i -e 's/\\<note tip\\>/\| :memo: /' ./**/*.md
# Replace \<note important\>
sed -i -e 's/\\<note important\\>/\| :bulb: /' ./**/*.md
# Replace \<note warning\>
sed -i -e 's/\\<note warning\\>/\| :warning: /' ./**/*.md
# Replace \</note\>
sed -i -e 's/\\<\/note\\>/ |\n| --- |/' ./**/*.md
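# For the record, the idea with those note fixups is that each DokuWiki note box
# ends up as a small single-cell table in the markdown, roughly like this
# (the content here is illustrative):
#   | :warning: Don't blindly copy-paste code from the internet |
#   | --- |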