mirror of
https://github.com/rawiriblundell/wiki.bash-hackers.org
synced 2024-12-25 06:00:41 +01:00
Correct markup/markdown distinction
This commit is contained in:
parent
05a2282bd7
commit
205592abc9
10
README.md
10
README.md
@ -1,16 +1,16 @@
|
|||||||
# wiki.bash-hackers.org
|
# wiki.bash-hackers.org
|
||||||
Extraction of wiki.bash-hackers.org from the Wayback Machine
|
Extraction of wiki.bash-hackers.org from the Wayback Machine
|
||||||
|
|
||||||
This is targeting pages that have been captured by the Wayback Machine that specifically have `'?do=edit'` on the end of their URL. This gives us the markdown source.
|
This is targeting pages that have been captured by the Wayback Machine that specifically have `'?do=edit'` on the end of their URL. This gives us the Dokuwiki Markup source.
|
||||||
|
|
||||||
See the incomplete script "archive_crawler" to see my working.
|
See the incomplete script "archive_crawler" to see my working.
|
||||||
|
|
||||||
- TODO: Markdown linting
|
|
||||||
- TODO: Markdown conversion from Dokuwiki "Markup" to GitHub "Markdown" using pandoc
|
|
||||||
- TODO: Parse the already downloaded files for any missing links
|
- TODO: Parse the already downloaded files for any missing links
|
||||||
|
- TODO: Markdown conversion from Dokuwiki Markup to GitHub Markdown using pandoc
|
||||||
|
- TODO: Markdown linting
|
||||||
- TODO: Rinse and repeat
|
- TODO: Rinse and repeat
|
||||||
|
|
||||||
## Extracting the markdown
|
## Extracting the Dokuwiki Markup
|
||||||
So the pages that have `'?do=edit'` on the end of their URL appear to have a reliable and predictable structure:
|
So the pages that have `'?do=edit'` on the end of their URL appear to have a reliable and predictable structure:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@ -31,7 +31,7 @@ So the pages that have `'?do-edit'` on the end of their URL appear to have a rel
|
|||||||
[ LINES BELOW REMOVED FOR BREVITY ]
|
[ LINES BELOW REMOVED FOR BREVITY ]
|
||||||
```
|
```
|
||||||
|
|
||||||
So basically, we remove everything from the first line to the line that contains `name="sectok"`, and then we remove everything after `</textarea>`, and what's left should be the markdown that we want.
|
So basically, we remove everything from the first line to the line that contains `name="sectok"`, and then we remove everything after `</textarea>`, and what's left should be the Dokuwiki Markup that we want.
|
||||||
|
|
||||||
## LICENSE
|
## LICENSE
|
||||||
|
|
||||||
|
@ -4,8 +4,8 @@
|
|||||||
|
|
||||||
# This works by targeting pages that have been captured by the Wayback Machine
|
# This works by targeting pages that have been captured by the Wayback Machine
|
||||||
# that specifically have '?do=edit' on the end of their URL.
|
# that specifically have '?do=edit' on the end of their URL.
|
||||||
# These pages present the original markdown source of the respective page.
|
# These pages present the original Dokuwiki Markup source of the respective page.
|
||||||
# So with a little massaging, we should be able to extract said markdown.
|
# So with a little massaging, we should be able to extract said Dokuwiki Markup.
|
||||||
|
|
||||||
# Where are we playing on the local file system?
|
# Where are we playing on the local file system?
|
||||||
basedir="${HOME}/git/wiki.bash-hackers.org"
|
basedir="${HOME}/git/wiki.bash-hackers.org"
|
||||||
@ -43,7 +43,7 @@ check_wayback_availability() {
|
|||||||
|
|
||||||
# Download the given target into a local file structure e.g.
|
# Download the given target into a local file structure e.g.
|
||||||
# http://web.archive.org/web/20220706170849/https://wiki.bash-hackers.org/scripting/bashbehaviour?do=edit
|
# http://web.archive.org/web/20220706170849/https://wiki.bash-hackers.org/scripting/bashbehaviour?do=edit
|
||||||
# Will download into scripting/bashbehaviour
|
# Will download into: "${basedir}/scripting/bashbehaviour"
|
||||||
get_wayback_target() {
|
get_wayback_target() {
|
||||||
local remote_target target_path target_dirname
|
local remote_target target_path target_dirname
|
||||||
remote_target="${1:?No target specified}"
|
remote_target="${1:?No target specified}"
|
||||||
@ -55,10 +55,10 @@ get_wayback_target() {
|
|||||||
# Create the path, ensuring that we strip the leading slash just-in-case e.g. /howto -> howto
|
# Create the path, ensuring that we strip the leading slash just-in-case e.g. /howto -> howto
|
||||||
mkdir -p "${basedir:?FATAL - basedir unset}/${target_dirname/\//}"
|
mkdir -p "${basedir:?FATAL - basedir unset}/${target_dirname/\//}"
|
||||||
# Download the remote target to the local path
|
# Download the remote target to the local path
|
||||||
curl -s -X GET "${remote_target}" | extract_markdown - > "./${target_path}.md"
|
curl -s -X GET "${remote_target}" | extract_markup - > "./${target_path}.markup"
|
||||||
}
|
}
|
||||||
|
|
||||||
# We want to pull internal-wiki links out of the markdown fragments. These look like
|
# We want to pull internal-wiki links out of the markup fragments. These look like
|
||||||
# [[path:to:file]]
|
# [[path:to:file]]
|
||||||
# But can be given friendly names like
|
# But can be given friendly names like
|
||||||
# [[path:to:file | friendly name]]
|
# [[path:to:file | friendly name]]
|
||||||
@ -79,12 +79,12 @@ scrape_targets() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Because of the structure of the downloaded files,
|
# Because of the structure of the downloaded files,
|
||||||
# we should be able to reliably extract our target markdown.
|
# we should be able to reliably extract our target Dokuwiki Markup.
|
||||||
# First, remove everything between the first line and 'name="sectok"'
|
# First, remove everything between the first line and 'name="sectok"'
|
||||||
# Next, remove everything after '</textarea>'
|
# Next, remove everything after '</textarea>'
|
||||||
# This should remove everything above and below our desired markdown
|
# This should remove everything above and below our desired Dokuwiki Markup
|
||||||
# We also take the opportunity to convert some HTML chars
|
# We also take the opportunity to convert some HTML chars
|
||||||
extract_markdown() {
|
extract_markup() {
|
||||||
sed -e '1,/name="sectok"/d' -e '/<\/textarea>/,$d' -e 's/&gt;/>/g' -e 's/&lt;/</g' -e 's/&amp;/\&/g' "${1:-/dev/stdin}"
|
sed -e '1,/name="sectok"/d' -e '/<\/textarea>/,$d' -e 's/&gt;/>/g' -e 's/&lt;/</g' -e 's/&amp;/\&/g' "${1:-/dev/stdin}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user