diff --git a/README.md b/README.md
index 8ab9035..f2b984c 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,16 @@
 # wiki.bash-hackers.org
 Extraction of wiki.bash-hackers.org from the Wayback Machine
 
-This is targeting pages that have been captured by the Wayback Machine that specifically have `'?do=edit'` on the end of their URL. This gives us the markdown source.
+This is targeting pages that have been captured by the Wayback Machine that specifically have `'?do=edit'` on the end of their URL. This gives us the Dokuwiki Markup source.
 
 See the incomplete script "archive_crawler" to see my working.
 
-- TODO: Markdown linting
-- TODO: Markdown conversion from Dokuwiki "Markup" to GitHub "Markdown" using pandoc
 - TODO: Parse the already downloaded files for any missing links
+- TODO: Markdown conversion from Dokuwiki Markup to GitHub Markdown using pandoc
+- TODO: Markdown linting
 - TODO: Rinse and repeat
 
-## Extracting the markdown
+## Extracting the Dokuwiki Markup
 
 So the pages that have `'?do-edit'` on the end of their URL appear to have a reliable and predictable structure:
 ```bash
@@ -31,7 +31,7 @@ So the pages that have `'?do-edit'` on the end of their URL appear to have a rel
 [ LINES BELOW REMOVED FOR BREVITY ]
 ```
 
-So basically, we remove everything from the first line to the line that contains `name="sectok"`, and then we remove everything after `</textarea>`, and what's left should be the markdown that we want.
+So basically, we remove everything from the first line to the line that contains `name="sectok"`, and then we remove everything after `</textarea>`, and what's left should be the Dokuwiki Markup that we want.
 
 ## LICENSE
 
diff --git a/archive_crawler b/archive_crawler
index 2ccbc31..d4e0f87 100644
--- a/archive_crawler
+++ b/archive_crawler
@@ -4,8 +4,8 @@
 
 # This works by targeting pages that have been captured by the Wayback Machine
 # that specifically have '?do=edit' on the end of their URL.
-# These pages present the original markdown source of the respective page.
-# So with a little massaging, we should be able to extract said markdown.
+# These pages present the original Dokuwiki Markup source of the respective page.
+# So with a little massaging, we should be able to extract said Dokuwiki Markup.
 
 # Where are we playing on the local file system?
 basedir="${HOME}/git/wiki.bash-hackers.org"
@@ -43,7 +43,7 @@ check_wayback_availability() {
 
 # Download the given target into a local file structure e.g.
 # http://web.archive.org/web/20220706170849/https://wiki.bash-hackers.org/scripting/bashbehaviour?do=edit
-# Will download into scripting/bashbehaviour
+# Will download into: "${basedir}/scripting/bashbehaviour"
 get_wayback_target() {
   local remote_target target_path target_dirname
   remote_target="${1:?No target specified}"
@@ -55,10 +55,10 @@
   # Create the path, ensuring that we strip the leading slash just-in-case e.g. /howto -> howto
   mkdir -p "${basedir:?FATAL - basedir unset}/${target_dirname/\//}"
   # Download the remote target to the local path
-  curl -s -X GET "${remote_target}" | extract_markdown - > "./${target_path}.md"
+  curl -s -X GET "${remote_target}" | extract_markup - > "./${target_path}.markup"
 }
 
-# We want to pull internal-wiki links out of the markdown fragments. These look like
+# We want to pull internal-wiki links out of the markup fragments. These look like
 # [[path:to:file]]
 # But can be given friendly names like
 # [[path:to:file | friendly name]]
@@ -79,12 +79,12 @@ scrape_targets() {
 }
 
 # Because of the structure of the downloaded files,
-# we should be able to reliably extract our target markdown.
+# we should be able to reliably extract our target Dokuwiki Markup.
 # First, remove everything between the first line and 'name="sectok"'
 # Next, remove everything after '</textarea>'
-# This should remove everything above and below our desired markdown
+# This should remove everything above and below our desired Dokuwiki Markup
 # We also take the opportunity to convert some HTML chars
-extract_markdown() {
+extract_markup() {
   sed -e '1,/name="sectok"/d' -e '/<\/textarea>/,$d' -e 's/&gt;/>/g' -e 's/&lt;/
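
For reference, a minimal standalone sketch of the extraction step the diff describes. The sed expression is lifted from the hunks above; since the last hunk is cut off, it is assumed here that the expression finishes with the matching '&lt;' to '<' conversion, and the example URL is the one quoted in the script's own comments:

#!/usr/bin/env bash
# Sketch only: pull the Dokuwiki Markup out of a Wayback Machine '?do=edit' capture.

extract_markup() {
  # 1) Delete everything from the first line through the line containing 'name="sectok"'
  # 2) Delete everything from '</textarea>' to the end of the input
  # 3) Decode the HTML entities that the edit form escapes
  sed -e '1,/name="sectok"/d' \
      -e '/<\/textarea>/,$d' \
      -e 's/&gt;/>/g' \
      -e 's/&lt;/</g'
}

# Usage, mirroring the curl pipeline in get_wayback_target():
url='http://web.archive.org/web/20220706170849/https://wiki.bash-hackers.org/scripting/bashbehaviour?do=edit'
curl -s -X GET "${url}" | extract_markup > bashbehaviour.markup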
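
The body of scrape_targets() does not appear in the hunks above, so the following is only a hypothetical sketch of how the internal-wiki links mentioned in its comments ([[path:to:file]] and [[path:to:file | friendly name]]) might be pulled out of the downloaded .markup files; the pattern below is an assumption, not the script's actual implementation:

# Hypothetical helper: list the wiki paths referenced by [[...]] links in a markup file.
list_internal_links() {
  local markup_file="${1:?No file specified}"
  # Emit every '[[...]]' occurrence, strip the brackets, then drop any
  # '| friendly name' suffix and trailing whitespace, leaving 'path:to:file'.
  grep -o '\[\[[^]]*\]\]' "${markup_file}" \
    | sed -e 's/^\[\[//' -e 's/\]\]$//' -e 's/[[:space:]]*|.*$//' -e 's/[[:space:]]*$//'
}

# Example: list_internal_links scripting/bashbehaviour.markup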