wiki.bash-hackers.org/archive_crawler

#!/bin/bash
# An incomplete script, more like notes really, for grabbing a
# copy of wiki.bash-hackers.org from archive.org

# This works by targeting pages that have been captured by the Wayback Machine 
# that specifically have '?do=edit' on the end of their URL.
# These pages present the original Dokuwiki Markup source of the respective page.
# So with a little massaging, we should be able to extract said Dokuwiki Markup.

# Ensure we have our required commands, otherwise fail early.
# Every missing command is reported before exiting, so that they can all be
# installed in one go.  The list covers each external command this script calls.
cmd_err=0
for cmd in grep sed tr sort uniq cut find dirname mkdir curl jq pandoc git; do
    if ! command -v "${cmd}" >/dev/null 2>&1; then
        printf -- '%s\n' "This script requires ${cmd} but it was not found in PATH." >&2
        cmd_err=$(( cmd_err + 1 ))
    fi
done
if (( cmd_err > 0 )); then
    exit 1
fi

# Where are we playing on the local file system?
# The root of the enclosing git repository; without one we have nowhere to
# write to, so stop here rather than carry on with an empty basedir.
if ! basedir="$(git rev-parse --show-toplevel)" || [[ -z "${basedir}" ]]; then
    printf -- '%s\n' "Unable to determine the git repository root; run this script from within the repository." >&2
    exit 1
fi

# Join two strings, placing the first argument in front of the second e.g.
# cmd: prepend foo bar
# out: foobar
# A separator may be supplied with -d or --delimiter e.g.
# cmd: prepend -d ';' foo bar
# out: foo;bar
prepend() {
    local separator=''
    # The optional separator must be given ahead of the two strings
    if [[ "${1}" == '-d' || "${1}" == '--delimiter' ]]; then
        separator="${2}"
        shift 2
    fi
    printf -- '%s%s%s\n' "${1}" "${separator}" "${2}"
}

# Ask archive.org's 'available' API whether it holds a capture of a URL.
# Prints the 'closest' snapshot URL from the API response, or null when the
# response carries none
# e.g.
# cmd: check_wayback_availability https://wiki.bash-hackers.org/howto/mutex?do=edit
# out: http://web.archive.org/web/20220615023742/https://wiki.bash-hackers.org/howto/mutex?do=edit
# vs
# cmd: check_wayback_availability https://contoso.com/pantsmcgee
# out: null
check_wayback_availability() {
    local api_endpoint='https://archive.org/wayback/available'
    local jq_filter='.archived_snapshots.closest.url'
    local lookup_url="${api_endpoint}?url=${1:?No target specified}"

    # The API responds with JSON; jq picks the snapshot URL out of it
    curl -s -X GET "${lookup_url}" |
        jq -r "${jq_filter}"
}

# Download the given Wayback Machine capture into a local file structure e.g.
# http://web.archive.org/web/20220706170849/https://wiki.bash-hackers.org/scripting/bashbehaviour?do=edit
# Will download into: "./scripting/bashbehaviour.markup"
# The file is addressed relative to the current directory while its parent
# directory is created under "${basedir}", so call this from within "${basedir}".
# Arguments: $1 - URL of the capture to download
# Returns:   0 if the file already exists locally
#            1 if no local path can be derived from the URL, or the
#              destination directory cannot be created
#            otherwise the exit status of the download pipeline
get_wayback_target() {
    local remote_target target_path target_dirname
    remote_target="${1:?No target specified}"
    # Strip out everything up to and including 'bash-hackers.org', and '?do=edit' e.g.
    # http://web.archive.org/web/20220615023742/https://wiki.bash-hackers.org/howto/mutex?do=edit -> /howto/mutex
    target_path="$(sed -n 's/.*bash-hackers\.org//p' <<< "${remote_target}" | sed -e 's/?do=edit//')"

    # Anything that is not a bash-hackers.org page - notably the literal 'null'
    # that check_wayback_availability prints for uncaptured pages - leaves us
    # with no path.  Downloading it would only produce a junk './.markup' file.
    if [[ -z "${target_path//\//}" ]]; then
        printf -- '%s\n' "Skipping '${remote_target}': unable to derive a local path from it." >&2
        return 1
    fi

    # If we already have it, bounce out
    if [[ -f "./${target_path}.markup" ]]; then
        printf -- '%s\n' "./${target_path}.markup appears to already exist.  Remove it and re-run this script to force a fresh download."
        return 0
    fi

    # Get the dirname e.g. /howto/mutex -> /howto
    target_dirname="$(dirname "${target_path}")"
    # Create the path, ensuring that we strip the leading slash just-in-case e.g. /howto -> howto
    mkdir -p "${basedir:?FATAL - basedir unset}/${target_dirname/\//}" || return 1
    # Download the remote target to the local path
    printf -- '%s\n' "Retrieving into ./${target_path}.markup"
    curl -s -X GET "${remote_target}" | extract_markup - > "./${target_path}.markup"
}

# We want to pull internal-wiki links out of the markup fragments.  These look like
# [[path:to:file]]
# But can be given friendly names like
# [[path:to:file | friendly name]]
# The friendly name might have spaces around the pipe char or not
# We start by grep'ing out every '[[...]]' link on its own line, so that several
# links sharing one line of markup are each captured
# Then we filter out external links e.g. '[[http://contoso.com]]'
# Then we strip friendly names
# Then we swap the brackets for a leading '/' and a trailing '?do=edit'
# Then we convert the ':' namespace separators to '/'
# Then we filter out unwanted garbage one last time
# Arguments: one or more markup files to scan
# Outputs:   one target per line e.g. /path/to/file?do=edit
scrape_targets() {
    local source_file
    # Insist on at least one file
    : "${1:?No target specified}"
    for source_file in "${@}"; do
        grep -o '\[\[[^]]*\]\]' -- "${source_file}" |
            grep -v '^\[\[http' |
            sed -e 's/[[:space:]]*|.*\]\]$/]]/' -e 's/^\[\[/\//' -e 's/\]\]$/?do=edit/' |
            tr ':' '/' |
            grep -Ev '==|!=|\$|#|/ |nowiki|ftp'
    done
}

# Because of the structure of the downloaded files,
# we should be able to reliably extract our target Dokuwiki Markup.
# First, remove everything between the first line and 'name="sectok"'
# Next, remove everything from '</textarea>' onwards
# This should remove everything above and below our desired Dokuwiki Markup
# We also take the opportunity to convert some HTML chars.  '&amp;' is
# converted last: doing it any earlier would turn an escaped entity such as
# '&amp;quot;' into '&quot;' and then wrongly convert that a second time.
# Arguments: $1 - file to read; defaults to stdin
# Outputs:   the extracted markup on stdout
extract_markup() {
    sed -e '1,/name="sectok"/d' \
        -e '/<\/textarea>/,$d' \
        -e 's/&gt;/>/g' \
        -e 's/&lt;/</g' \
        -e 's/&quot;/"/g' \
        -e 's/&amp;/\&/g' "${1:-/dev/stdin}"
}

###### Beyond this point things get a little wishy-washy ######

(
    # Work from the repository root; refuse to continue if basedir is empty or unset
    cd "${basedir:?FATAL - basedir unset}" || exit 1

    # If it's not already here, get the start page
    # (quoted, as the '?' would otherwise be treated as a glob character)
    if [[ ! -f start.markup ]]; then
        get_wayback_target 'https://web.archive.org/web/20220930131429/https://wiki.bash-hackers.org/start?do=edit'
    fi

    # Extract a list of targets from the start page
    while read -r; do
        prepend "https://wiki.bash-hackers.org" "${REPLY}"
    done < <(scrape_targets start.markup) > raw_targets

    # For each scraped target, validate that they're available from archive.org
    # and in doing so, generate a list of the urls for the latest captures of each.
    # Pages without a capture come back as a literal 'null', which is not
    # something we can download, so those are dropped from the list.
    while read -r; do
        check_wayback_availability "${REPLY}"
    done < raw_targets | grep -vx 'null' > waybacktargets

    # Work through each of the above urls
    while read -r; do
        get_wayback_target "${REPLY}"
    done < waybacktargets

    # Now we parse through the .markup files and try to generate a fresh list of raw_targets
    # With globstar enabled (shopt -s globstar), this would be
    # scrape_targets **/*.markup | sort -u
    while read -r; do
        scrape_targets "${REPLY}"
    done < <(find . -name "*.markup") | sort -u > fresh_targets

    while read -r; do
        prepend "https://wiki.bash-hackers.org" "${REPLY}"
    done < fresh_targets > raw_targets

    # And as before, we generate a list of targets and retrieve them
    while read -r; do
        check_wayback_availability "${REPLY}"
    done < raw_targets | grep -vx 'null' > waybacktargets

    while read -r; do
        get_wayback_target "${REPLY}"
    done < waybacktargets

    # Next, we convert from dokuwiki markup to github markdown
    # Only the trailing '.markup' is swapped for '.md'
    while read -r; do
        pandoc --from dokuwiki --to gfm --toc --no-highlight "${REPLY}" > "${REPLY%.markup}.md"
    done < <(find . -name "*.markup")
)

# Convert the Dokuwiki <note> panels, which survive the pandoc conversion as
# backslash-escaped text such as '\<note\>', into single-cell markdown tables
# that are prefixed with an emoji shortcode.
# A recursive './**/*.md' glob needs 'shopt -s globstar'; without it '**'
# behaves like '*' and only files exactly one directory deep are matched.
# find has no such requirement, reaches every depth, and copes with there
# being no .md files at all.
# Arguments: $1 - directory to search; defaults to the current directory
convert_note_panels() {
    local search_dir
    search_dir="${1:-.}"

    # In order: <note>, <note important>, <note info>, <note tip>,
    # <note warning> and finally the closing </note>
    find "${search_dir}" -type f -name '*.md' -exec sed -i \
        -e 's/\\<note\\>/\| :shell: /' \
        -e 's/\\<note important\\>/\| :loudspeaker: /' \
        -e 's/\\<note info\\>/\| :memo: /' \
        -e 's/\\<note tip\\>/\| :bulb: /' \
        -e 's/\\<note warning\\>/\| :warning: /' \
        -e 's/\\<\/note\\>/ |\n| --- |/' {} +
}

convert_note_panels

# Let's correct any internal links so that they point to their new .md homes
# The list of internal link targets is harvested from the start page, then
# every .md file is rewritten so that each of those targets gains '.md'.
#  * every '[text](target)' link is harvested, not just the first on a line
#  * a target is only rewritten where the next character cannot be part of a
#    path, so '/x/read' leaves '/x/readonly' and '/x/read.md' alone.  This
#    also makes re-running the script harmless.
#  * all of the substitutions are handed to one sed invocation via find
# Globals:   mdlist (written) - the harvested link targets
# Arguments: $1 - page to harvest targets from; defaults to start.md
#            $2 - directory to search for .md files; defaults to '.'
fix_internal_links() {
    local start_file search_dir safe_re target pattern
    local -a sed_args=()
    start_file="${1:-start.md}"
    search_dir="${2:-.}"
    # Targets are spliced into sed expressions, so only accept plain paths
    safe_re='^[[:alnum:]_./-]+$'

    if [[ ! -f "${start_file}" ]]; then
        printf -- '%s\n' "${start_file} not found, skipping internal link correction." >&2
        return 0
    fi

    # Reduce each link to its bare target: no link text, no '#anchor',
    # no title and no '.md' left over from a previous run
    mapfile -t mdlist < <(
        grep -o '\[[^]]*\]([^)]*)' -- "${start_file}" |
            sed -e 's/^\[[^]]*\](//' -e 's/)$//' -e 's/#.*$//' \
                -e 's/[[:space:]].*$//' -e 's/\.md$//' |
            grep -v http |
            grep '/' |
            sort -u
    )

    for target in "${mdlist[@]}"; do
        [[ "${target}" =~ ${safe_re} ]] || continue
        # '.' is the only regex-special character the check above lets through
        pattern="$(sed -e 's/\./\\./g' <<< "${target}")"
        sed_args+=(
            -e "s|${pattern}\([^[:alnum:]_./-]\)|${target}.md\1|g"
            -e "s|${pattern}\$|${target}.md|"
        )
    done

    # With no expressions, sed would mistake the first file for its script
    (( ${#sed_args[@]} > 0 )) || return 0
    find "${search_dir}" -type f -name '*.md' -exec sed -i "${sed_args[@]}" {} +
}

fix_internal_links

# Because the pandoc output has a mix of markdown tables and html tables
# the above does not capture links within html tables.
# We're getting to the point that the bulk of this capture is done
# and we're over to manual corrections.  But something like this may help:
# grep -R "href=\"/" * | grep -v ".*/.*\.md"
First crawl of archive.org 2023-04-14 13:44:41 +02:00			`#!/bin/bash`
			`# An incomplete script, more like notes really, for grabbing a`
			`# copy of wiki.bash-hackers.org from archive.org`

Add second pass - raw markdown files 2023-04-15 13:23:49 +02:00			`# This works by targeting pages that have been captured by the Wayback Machine`
			`# that specifically have '?do=edit' on the end of their URL.`
Correct markup/markdown distinction 2023-04-16 02:11:19 +02:00			`# These pages present the original Dokuwiki Markup source of the respective page.`
			`# So with a little massaging, we should be able to extract said Dokuwiki Markup.`
Add second pass - raw markdown files 2023-04-15 13:23:49 +02:00
Completed checks for missed targets 2023-04-16 12:49:07 +02:00			`# Ensure we have our required commands, otherwise fail early`
			`cmd_err=0`
Generate `$basedir` from git repo root 2023-04-26 20:47:28 +02:00			`for cmd in grep sed tr sort uniq mkdir curl jq pandoc git; do`
Completed checks for missed targets 2023-04-16 12:49:07 +02:00			`if ! command -v "${cmd}" >/dev/null 2>&1; then`
			`printf -- '%s\n' "This script requires ${cmd} but it was not found in PATH." >&2`
			`(( cmd_err++ ))`
			`fi`
			`done`
			`(( cmd_err > 0 )) && exit 1`

First crawl of archive.org 2023-04-14 13:44:41 +02:00			`# Where are we playing on the local file system?`
Generate `$basedir` from git repo root 2023-04-26 20:47:28 +02:00			`basedir="$(git rev-parse --show-toplevel)"`
First crawl of archive.org 2023-04-14 13:44:41 +02:00
			`# Prepend a string e.g.`
			`# cmd: prepend foo bar`
			`# out: foobar`
			`# Also has a delimiter option e.g.`
			`# cmd: prepend -d ';' foo bar`
			`# out: foo;bar`
			`prepend() {`
			`local _prepend_delimiter`
			`case "${1}" in`
			`(-d\|--delimiter)`
			`_prepend_delimiter="${2}"`
			`shift 2`
			`;;`
			`esac`
			`printf -- '%s\n' "${1}${_prepend_delimiter:-}${2}"`
			`}`

			`# Call archive.org's 'available' API to see if a site is available`
			`# This will either return the URL of the most current snapshot, or null`
			`# e.g.`
			`# cmd: check_wayback_availability https://wiki.bash-hackers.org/howto/mutex?do=edit`
			`# out: http://web.archive.org/web/20220615023742/https://wiki.bash-hackers.org/howto/mutex?do=edit`
			`# vs`
			`# cmd: check_wayback_availability https://contoso.com/pantsmcgee`
			`# out: null`
			`check_wayback_availability() {`
			`local remote_target`
			`remote_target="https://archive.org/wayback/available?url=${1:?No target specified}"`
			`curl -s -X GET "${remote_target}" \| jq -r '.archived_snapshots.closest.url'`
			`}`

			`# Download the given target into a local file structure e.g.`
			`# http://web.archive.org/web/20220706170849/https://wiki.bash-hackers.org/scripting/bashbehaviour?do=edit`
Correct markup/markdown distinction 2023-04-16 02:11:19 +02:00			`# Will download into: "${basedir}/scripting/bashbehaviour"`
First crawl of archive.org 2023-04-14 13:44:41 +02:00			`get_wayback_target() {`
			`local remote_target target_path target_dirname`
			`remote_target="${1:?No target specified}"`
			`# Strip out everything after 'bash-hackers.org' and '?do=edit' e.g.`
			`# http://web.archive.org/web/20220615023742/https://wiki.bash-hackers.org/howto/mutex?do=edit -> /howto/mutex`
			`target_path="$(sed -n 's/.*bash-hackers.org//p' <<< "${remote_target}" \| sed -e 's/?do=edit//')"`
Completed checks for missed targets 2023-04-16 12:49:07 +02:00
			`# If we already have it, bounce out`
			`if [[ -f "./${target_path}.markup" ]]; then`
			`printf -- '%s\n' "./${target_path}.markup appears to already exist. Remove it and re-run this script to force a fresh download."`
			`return 0`
			`fi`

First crawl of archive.org 2023-04-14 13:44:41 +02:00			`# Get the dirname e.g. /howto/mutex?do=edit -> /howto`
			`target_dirname="$(dirname "${target_path}")"`
			`# Create the path, ensuring that we strip the leading slash just-in-case e.g. /howto -> howto`
			`mkdir -p "${basedir:?FATAL - basedir unset}/${target_dirname/\//}"`
			`# Download the remote target to the local path`
Completed checks for missed targets 2023-04-16 12:49:07 +02:00			`printf -- '%s\n' "Retrieving into ./${target_path}.markup"`
Correct markup/markdown distinction 2023-04-16 02:11:19 +02:00			`curl -s -X GET "${remote_target}" \| extract_markup - > "./${target_path}.markup"`
First crawl of archive.org 2023-04-14 13:44:41 +02:00			`}`

Correct markup/markdown distinction 2023-04-16 02:11:19 +02:00			`# We want to pull internal-wiki links out of the markup fragments. These look like`
First crawl of archive.org 2023-04-14 13:44:41 +02:00			`# [[path:to:file]]`
			`# But can be given friendly names like`
			`# [[path:to:file \| friendly name]]`
			`# The friendly name might have leading spaces around the pipe char or not`
			`# We start by grep'ing for '[['`
			`# Then we filter out external links e.g. '[[http://contoso.com]]'`
			`# Then we grep out just the link substrings`
			`# Then we filter out friendly names (this could be brutally tidied up, it's late and I'm lazy right now)`
			`# Then we append "/?do=edit/"`
Completed checks for missed targets 2023-04-16 12:49:07 +02:00			`# Then we filter out unwanted garbage one last time`
First crawl of archive.org 2023-04-14 13:44:41 +02:00			`scrape_targets() {`
			`local source_file`
			`source_file="${1:?No target specified}"`
Completed checks for missed targets 2023-04-16 12:49:07 +02:00			`for source_file in "${@}"; do`
			`grep "\[\[" "${source_file}" \|`
			`grep -v "\[\[http" \|`
			`grep -o "\[\[.*\]\]" \|`
			`sed -e 's/ \| .*\]\]/]]/g' -e 's/\| .*\]\]/]]/g' -e 's/\|.*\]\]/]]/g' -e 's/\[\[/\//g' -e 's/\]\]/?do=edit/g' \|`
			`tr ':' '/' \|`
			`grep -Ev '==\|!=\|\$\|#\|/ \|nowiki\|ftp'`
			`done`
First crawl of archive.org 2023-04-14 13:44:41 +02:00			`}`

Add second pass - raw markdown files 2023-04-15 13:23:49 +02:00			`# Because of the structure of the downloaded files,`
Correct markup/markdown distinction 2023-04-16 02:11:19 +02:00			`# we should be able to reliably extract our target Dokuwiki Markup.`
Add second pass - raw markdown files 2023-04-15 13:23:49 +02:00			`# First, remove everything between the first line and 'name="sectok"'`
			`# Next, remove everything after '</textarea>'`
Correct markup/markdown distinction 2023-04-16 02:11:19 +02:00			`# This should remove everything above and below our desired Dokuwiki Markup`
Correct some HTML chars 2023-04-15 13:53:05 +02:00			`# We also take the opportunity to convert some HTML chars`
Correct markup/markdown distinction 2023-04-16 02:11:19 +02:00			`extract_markup() {`
Correct html quote char 2023-04-24 13:27:29 +02:00			`sed -e '1,/name="sectok"/d' \`
			`-e '/<\/textarea>/,$d' \`
			`-e 's/&gt;/>/g' \`
			`-e 's/&lt;/</g' \`
			`-e 's/&amp;/\&/g' \`
			`-e 's/&quot;/"/g' "${1:-/dev/stdin}"`
Add second pass - raw markdown files 2023-04-15 13:23:49 +02:00			`}`

			`###### Beyond this point things get a little wishy-washy ######`

First crawl of archive.org 2023-04-14 13:44:41 +02:00			`(`
			`cd "${basedir}" \|\| exit 1`

			`# If it's not already here, get the start page`
Completed checks for missed targets 2023-04-16 12:49:07 +02:00			`if [[ ! -f start.markup ]]; then`
First crawl of archive.org 2023-04-14 13:44:41 +02:00			`get_wayback_target https://web.archive.org/web/20220930131429/https://wiki.bash-hackers.org/start?do=edit`
			`fi`

Add second pass - raw markdown files 2023-04-15 13:23:49 +02:00			`# Extract a list of targets from the start page`
First crawl of archive.org 2023-04-14 13:44:41 +02:00			`while read -r; do`
Add second pass - raw markdown files 2023-04-15 13:23:49 +02:00			`prepend "https://wiki.bash-hackers.org" "${REPLY}"`
Completed checks for missed targets 2023-04-16 12:49:07 +02:00			`done < <(scrape_targets start.markup) > raw_targets`
First crawl of archive.org 2023-04-14 13:44:41 +02:00
Add second pass - raw markdown files 2023-04-15 13:23:49 +02:00			`# For each scraped target, validate that they're available from archive.org`
			`# and in doing so, generate a list of the urls for the latest captures of each`
First crawl of archive.org 2023-04-14 13:44:41 +02:00			`while read -r; do`
			`check_wayback_availability "${REPLY}"`
			`done < raw_targets > waybacktargets`

Add second pass - raw markdown files 2023-04-15 13:23:49 +02:00			`# Work through each of the above urls`
First crawl of archive.org 2023-04-14 13:44:41 +02:00			`while read -r; do`
			`get_wayback_target "${REPLY}"`
			`done < waybacktargets`
Add note about pandoc'ing to archive_crawler 2023-04-16 10:12:12 +02:00
Completed checks for missed targets 2023-04-16 12:49:07 +02:00			`# Now we parse through the .markup files and try to generate a fresh list of raw_targets`
			`# With extglob, this would be`
			`# scrape_targets **/*.markup \| sort \| uniq`
			`while read -r; do`
			`scrape_targets "${REPLY}"`
Convert note panels 2023-04-17 03:29:50 +02:00			`done < <(find . -name "*.markup") \| sort \| uniq > fresh_targets`

			`while read -r; do`
			`prepend "https://wiki.bash-hackers.org" "${REPLY}"`
			`done < fresh_targets > raw_targets`
Completed checks for missed targets 2023-04-16 12:49:07 +02:00
			`# And as before, we generate a list of targets and retrieve them`
			`while read -r; do`
			`check_wayback_availability "${REPLY}"`
			`done < raw_targets > waybacktargets`

			`while read -r; do`
			`get_wayback_target "${REPLY}"`
			`done < waybacktargets`

			`# Next, we convert from dokuwiki markup to github markdown`
			`while read -r; do`
			`pandoc --from dokuwiki --to gfm --toc --no-highlight "${REPLY}" > "${REPLY/.markup/}.md"`
			`done < <(find . -name "*.markup")`
			`)`
Convert note panels 2023-04-17 03:29:50 +02:00
			`# Ugh, screw the constant while read loops. Let's extglob from here.`

Convert a few more note panels 2023-04-17 03:48:13 +02:00			`# Replace <note>`
			`sed -i -e 's/\\<note\\>/\\| :shell: /' ./**/*.md`

Convert note panels 2023-04-17 03:29:50 +02:00			`# Replace <note important>`
			`sed -i -e 's/\\<note important\\>/\\| :loudspeaker: /' ./**/*.md`

			`# Replace <note info>`
			`sed -i -e 's/\\<note info\\>/\\| :memo: /' ./**/*.md`

			`# Replace <note tip>`
			`sed -i -e 's/\\<note tip\\>/\\| :bulb: /' ./**/*.md`

			`# Replace <note warning>`
			`sed -i -e 's/\\<note warning\\>/\\| :warning: /' ./**/*.md`

			`# Replace </note>`
			`sed -i -e 's/\\<\/note\\>/ \|\n\| --- \|/' ./**/*.md`
Update internal links in start.md to point to their correct locations 2023-04-24 12:53:11 +02:00
			`# Let's correct any internal links so that they point to their new .md homes`
Correct remaining html-bound links 2023-04-28 01:25:39 +02:00			`mapfile -t mdlist < <(grep -o "\[.*\]\(.*\)" start.md \| grep -v http \| cut -d '(' -f2 \| cut -d ')' -f1 \| cut -d '#' -f1 \| grep '/' \| sort \| uniq)`
Update internal links in start.md to point to their correct locations 2023-04-24 12:53:11 +02:00
Add need_love.md 2023-04-25 13:15:19 +02:00			`while read -r; do`
			`for target in "${mdlist[@]}"; do`
			`sed -i -e "s\|${target}\|${target}.md\|g" "${REPLY}"`
			`done`
			`done < <(find . -name "*.md")`
Correct remaining html-bound links 2023-04-28 01:25:39 +02:00
			`# Because the pandoc output has a mix of markdown tables and html tables`
			`# the above does not capture links within html tables.`
			`# We're getting to the point that the bulk of this capture is done`
			`# and we're over to manual corrections. But something like this may help:`
			`# grep -R "href=\"/" * \| grep -v ".*/.*\.md"`