#!/bin/bash
# Mirror of https://github.com/rawiriblundell/wiki.bash-hackers.org
# (synced 2024-12-24 13:50:39 +01:00; 181 lines, 6.8 KiB, Bash)
#
# An incomplete script, more like notes really, for grabbing a
# copy of wiki.bash-hackers.org from archive.org
#
# This works by targeting pages that have been captured by the Wayback Machine
# that specifically have '?do=edit' on the end of their URL.
# These pages present the original Dokuwiki Markup source of the respective page.
# So with a little massaging, we should be able to extract said Dokuwiki Markup.

# Ensure we have our required commands, otherwise fail early.
# Every missing command is reported before exiting, so the user can fix
# them all in one go rather than one per run.
cmd_err=0
for cmd in grep sed tr sort uniq mkdir dirname find curl jq pandoc; do
  if ! command -v "${cmd}" >/dev/null 2>&1; then
    printf -- '%s\n' "This script requires ${cmd} but it was not found in PATH." >&2
    cmd_err=$(( cmd_err + 1 ))
  fi
done
(( cmd_err > 0 )) && exit 1

# Where are we playing on the local file system?
basedir="${HOME}/git/wiki.bash-hackers.org"

# Join two strings, optionally with a delimiter between them.
# Arguments:
#   -d|--delimiter DELIM  (optional, must come first) string placed between
#                         the two operands
#   $1  leading string
#   $2  trailing string
# Outputs: the joined string followed by a newline on stdout
# e.g.
#   cmd: prepend foo bar
#   out: foobar
# With the delimiter option:
#   cmd: prepend -d ';' foo bar
#   out: foo;bar
prepend() {
  local _prepend_delimiter
  case "${1}" in
    (-d|--delimiter)
      _prepend_delimiter="${2}"
      shift 2
      ;;
  esac
  printf -- '%s\n' "${1}${_prepend_delimiter:-}${2}"
}

# Call archive.org's 'available' API to see if a site is available
# Arguments:
#   $1  the URL to look up (required; the shell aborts with a message if missing)
# Outputs: the URL of the closest snapshot as extracted by jq, or the literal
#          string 'null' when the JSON response carries no such field
# e.g.
#   cmd: check_wayback_availability https://wiki.bash-hackers.org/howto/mutex?do=edit
#   out: http://web.archive.org/web/20220615023742/https://wiki.bash-hackers.org/howto/mutex?do=edit
# vs
#   cmd: check_wayback_availability https://contoso.com/pantsmcgee
#   out: null
check_wayback_availability() {
  local remote_target
  remote_target="https://archive.org/wayback/available?url=${1:?No target specified}"
  # jq -r prints the raw string (no JSON quoting); a missing key prints 'null'
  curl -s -X GET "${remote_target}" | jq -r '.archived_snapshots.closest.url'
}

# Download the given target into a local file structure e.g.
#   http://web.archive.org/web/20220706170849/https://wiki.bash-hackers.org/scripting/bashbehaviour?do=edit
# Will create "${basedir}/scripting" and write "./scripting/bashbehaviour.markup"
# relative to the current directory (callers are expected to be in basedir).
# Arguments:
#   $1  Wayback Machine capture URL of a '?do=edit' wiki page (required)
# Globals: basedir (read)
# Returns:
#   0 if the markup was retrieved or already exists locally
#   1 if the argument is not a bash-hackers.org URL (e.g. the 'null' emitted
#     for unavailable pages), the directory could not be created, or the
#     download produced no markup
get_wayback_target() {
  local remote_target target_path target_dirname target_file
  remote_target="${1:?No target specified}"

  # Anything that doesn't reference the wiki has no usable local path
  if [[ "${remote_target}" != *bash-hackers.org* ]]; then
    printf -- '%s\n' "Skipping '${remote_target}': not a bash-hackers.org capture." >&2
    return 1
  fi

  # Strip out everything up to 'bash-hackers.org' and the '?do=edit' e.g.
  # http://web.archive.org/web/20220615023742/https://wiki.bash-hackers.org/howto/mutex?do=edit -> /howto/mutex
  target_path="${remote_target##*bash-hackers.org}"
  target_path="${target_path/\?do=edit/}"
  if [[ -z "${target_path}" ]]; then
    printf -- '%s\n' "Skipping '${remote_target}': no page path found." >&2
    return 1
  fi
  target_file="./${target_path#/}.markup"

  # If we already have it, bounce out
  if [[ -f "${target_file}" ]]; then
    printf -- '%s\n' "${target_file} appears to already exist. Remove it and re-run this script to force a fresh download."
    return 0
  fi

  # Get the dirname e.g. /howto/mutex -> /howto
  target_dirname="$(dirname "${target_path}")"
  # Create the path, stripping only a *leading* slash e.g. /howto -> howto
  mkdir -p "${basedir:?FATAL - basedir unset}/${target_dirname#/}" || return 1

  # Download the remote target to the local path
  printf -- '%s\n' "Retrieving into ${target_file}"
  curl -s -X GET "${remote_target}" | extract_markup - > "${target_file}"

  # A failed download would otherwise leave an empty file behind, and the
  # "already exists" check above would then block every later retry.
  if [[ ! -s "${target_file}" ]]; then
    printf -- '%s\n' "No markup extracted from ${remote_target}; discarding empty ${target_file}" >&2
    rm -f -- "${target_file}"
    return 1
  fi
}

# We want to pull internal-wiki links out of the markup fragments. These look like
#   [[path:to:file]]
# But can be given friendly names like
#   [[path:to:file | friendly name]]
# The friendly name might have spaces around the pipe char or not
#
# For each file given:
#   1. Extract every '[[...]]' substring, one per output line. The match stops
#      at the first ']' so several links on one line stay separate.
#   2. Drop external links e.g. '[[http://contoso.com]]' (per link, not per
#      line, so internal links sharing a line with an external one survive)
#   3. Strip any friendly name, along with the spaces before the pipe
#   4. Turn '[[' into a leading '/' and ']]' into '?do=edit'
#   5. Turn the ':' namespace separators into '/'
#   6. Filter out unwanted garbage (shell tests like '[[ $a == b ]]', anchors, etc)
# Arguments: one or more markup files (at least one is required)
# Outputs: one '/path/to/page?do=edit' per line on stdout
scrape_targets() {
  local source_file
  : "${1:?No target specified}"
  for source_file in "${@}"; do
    grep -o '\[\[[^]]*]]' "${source_file}" |
      grep -v '^\[\[http' |
      sed -e 's/ *|.*]]/]]/' -e 's#^\[\[#/#' -e 's#]]$#?do=edit#' |
      tr ':' '/' |
      grep -Ev '==|!=|\$|#|/ |nowiki|ftp'
  done
}

# Because of the structure of the downloaded files,
# we should be able to reliably extract our target Dokuwiki Markup.
# First, remove everything between the first line and 'name="sectok"'
# Next, remove everything from '</textarea>' onwards
# This should remove everything above and below our desired Dokuwiki Markup
# We also take the opportunity to decode the HTML entities used to escape the
# markup inside the textarea. '&amp;' is decoded last so that an escaped
# entity such as '&amp;lt;' correctly becomes the literal text '&lt;'.
# Arguments:
#   $1  file to read (optional; defaults to stdin)
# Outputs: the extracted markup on stdout
extract_markup() {
  sed \
    -e '1,/name="sectok"/d' \
    -e '/<\/textarea>/,$d' \
    -e 's/&gt;/>/g' \
    -e 's/&lt;/</g' \
    -e 's/&quot;/"/g' \
    -e "s/&#039;/'/g" \
    -e 's/&amp;/\&/g' \
    "${1:-/dev/stdin}"
}

###### Beyond this point things get a little wishy-washy ######

# Resolve every wiki path listed in a file to its latest Wayback Machine
# capture and download it. Must be run from within basedir.
# Arguments:
#   $1  file containing one '/path/to/page?do=edit' per line
# Side effects: (re)writes ./raw_targets and ./waybacktargets
crawl_targets() {
  local target_list wiki_path wiki_url snapshot_url
  target_list="${1:?No target list specified}"

  # Turn the scraped paths into full wiki URLs
  while IFS= read -r wiki_path; do
    prepend "https://wiki.bash-hackers.org" "${wiki_path}"
  done < "${target_list}" > raw_targets

  # For each URL, validate that it's available from archive.org and in doing
  # so, generate a list of the urls for the latest captures of each.
  # Pages with no capture come back as 'null'; drop those.
  while IFS= read -r wiki_url; do
    check_wayback_availability "${wiki_url}"
  done < raw_targets | grep -vx 'null' > waybacktargets

  # Work through each of the above urls
  while IFS= read -r snapshot_url; do
    get_wayback_target "${snapshot_url}"
  done < waybacktargets
}

# Bail out of the whole script (not just the subshell below) if we have
# nowhere to work.
if [[ ! -d "${basedir}" ]]; then
  printf -- '%s\n' "FATAL - basedir '${basedir}' does not exist." >&2
  exit 1
fi

(
  cd "${basedir}" || exit 1

  # If it's not already here, get the start page
  # (quoted: the '?' would otherwise be treated as a glob character)
  if [[ ! -f start.markup ]]; then
    get_wayback_target 'https://web.archive.org/web/20220930131429/https://wiki.bash-hackers.org/start?do=edit'
  fi
  if [[ ! -f start.markup ]]; then
    printf -- '%s\n' "Unable to retrieve the start page, nothing to crawl." >&2
    exit 1
  fi

  # First pass: crawl everything linked from the start page
  scrape_targets start.markup | sort -u > fresh_targets
  crawl_targets fresh_targets

  # Second pass: parse through all the .markup files we now have and crawl
  # anything they link to.
  # With globstar, the scrape would be
  #   scrape_targets **/*.markup | sort | uniq
  while IFS= read -r markup_file; do
    scrape_targets "${markup_file}"
  done < <(find . -name '*.markup') | sort -u > fresh_targets
  crawl_targets fresh_targets

  # Next, we convert from dokuwiki markup to github markdown
  # (%.markup strips the suffix only, never a '.markup' mid-path)
  while IFS= read -r markup_file; do
    pandoc --from dokuwiki --to gfm --toc --no-highlight "${markup_file}" > "${markup_file%.markup}.md"
  done < <(find . -name '*.markup')
)

# Ugh, screw the constant while read loops. Let's use globstar from here.

# Rewrite the escaped Dokuwiki note tags found in the generated markdown
# (e.g. '\<note tip\>') into single-cell table rows with an emoji marker.
#   \<note important\>  ->  | :loudspeaker:
#   \<note info\>       ->  | :memo:
#   \<note tip\>        ->  | :bulb:
#   \<note warning\>    ->  | :warning:
#   \</note\>           ->   |  followed by a '| --- |' line
# Arguments:
#   $1  directory to process recursively (optional; defaults to basedir)
# Returns: 1 if no directory is known or it cannot be entered
# Notes:
#   * Runs in a subshell so neither the cd nor the shopt changes leak out.
#   * globstar is required for '**' to recurse (and to match top-level files);
#     nullglob stops sed being handed the literal pattern when nothing matches.
#   * 'sed -i' and '\n' in the replacement are GNU sed features.
convert_note_tags() {
  local root
  root="${1:-${basedir:-}}"
  if [[ -z "${root}" ]]; then
    printf -- '%s\n' "convert_note_tags: no directory given and basedir is unset." >&2
    return 1
  fi
  (
    cd "${root}" || exit 1
    shopt -s globstar nullglob
    md_files=( ./**/*.md )
    (( ${#md_files[@]} > 0 )) || exit 0
    sed -i \
      -e 's/\\<note important\\>/| :loudspeaker: /' \
      -e 's/\\<note info\\>/| :memo: /' \
      -e 's/\\<note tip\\>/| :bulb: /' \
      -e 's/\\<note warning\\>/| :warning: /' \
      -e 's/\\<\/note\\>/ |\n| --- |/' \
      "${md_files[@]}"
  )
}

convert_note_tags "${basedir}"