diff options
author | Benjamin Franzke <benjaminfranzke@googlemail.com> | 2012-06-28 10:09:28 +0200 |
---|---|---|
committer | Benjamin Franzke <benjaminfranzke@googlemail.com> | 2012-06-28 10:09:28 +0200 |
commit | 36c0f2f4d8aef72776a337eef05bec8cd0360e83 (patch) | |
tree | 0751342b278da2a5a67226b7450f046b50b279de | |
download | bible-fetch-36c0f2f4d8aef72776a337eef05bec8cd0360e83.tar.gz bible-fetch-36c0f2f4d8aef72776a337eef05bec8cd0360e83.tar.bz2 bible-fetch-36c0f2f4d8aef72776a337eef05bec8cd0360e83.zip |
Add scripts to download elberfelder from die-bibel.de
The download is done by shell scripts using curl, which parse books
and chapters with sed. The HTML is then prepared with sed to be converted
to Zefania XML using an XSL stylesheet.
-rw-r--r-- | .gitignore | 5 | ||||
-rwxr-xr-x | concat.sh | 24 | ||||
-rwxr-xr-x | convert.sed | 31 | ||||
-rw-r--r-- | convert.xsl | 88 | ||||
-rwxr-xr-x | download-books.sh | 17 | ||||
-rwxr-xr-x | download-chapters.sh | 17 | ||||
-rwxr-xr-x | download.sh | 4 | ||||
-rwxr-xr-x | gen-bible.sh | 6 | ||||
-rwxr-xr-x | list-chapters.sh | 11 | ||||
-rwxr-xr-x | make-xml.sh | 8 | ||||
-rwxr-xr-x | parse-book.sed | 4 | ||||
-rwxr-xr-x | parse-chapter.sed | 4 |
12 files changed, 219 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cf7201c --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +books/ +chapters/ +book-list +elberfelder2006.zip +elberfelder2006.xml diff --git a/concat.sh b/concat.sh new file mode 100755 index 0000000..e9b203e --- /dev/null +++ b/concat.sh @@ -0,0 +1,24 @@ +#!/bin/sh + +echo '<?xml version="1.0" encoding="utf-8"?>' +echo '<xmlbible type="x-bible" biblename="Elberfelder 2006" status="v">' +echo '<information>' +echo '<title>Elberfelder 2006</title>' +echo '<format>Zefania XML Bible Markup Language</format>' +echo '</information>' +j=1 +cat book-list | while read buch +do + echo "<biblebook bname=\"$buch\" bnumber=\"$j\">" + + i=1 + while [ -e "chapters/$buch/$i" ] + do + cat "chapters/$buch/$i.xml" | sed 1d + i=$((i+1)) + done + + j=$((j+1)) + echo "</biblebook>" +done +echo '</xmlbible>' diff --git a/convert.sed b/convert.sed new file mode 100755 index 0000000..2449d1f --- /dev/null +++ b/convert.sed @@ -0,0 +1,31 @@ +#!/bin/sed -f + +/data-href/s/&/&amp;/g + +# xsltproc --html doesnt understand html5 +s/section/div/g +s/header/h1/g +s/<nav/<div/g +s:</nav:</div:g +s/footer/div/g +s/article/div/g + +# Fix incorrect < and > inside p tags, that is by allowing only +# known tag be surrounded by < and >. 
+ta +:a +s/<p>\(.*\)<\/p>/\1/ +tfix +b + +:fix +s/</\&lt;/g +s/>/\&gt;/g + +s/&lt;span\([^;]*\)&gt;/<span\1>/g +s/&lt;\/span&gt;/<\/span>/g + +s/\&lt;em\&gt;/<em>/g +s/\&lt;\/em\&gt;/<\/em>/g + +s:.*:<p>&</p>: diff --git a/convert.xsl b/convert.xsl new file mode 100644 index 0000000..d8f1d77 --- /dev/null +++ b/convert.xsl @@ -0,0 +1,88 @@ +<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"> + <xsl:output method="xml" indent="yes" encoding="UTF-8" media-type="text/xml"/> + + <xsl:template match="h2"> + <caption vref="{following::span[@class='verse']}"> + <xsl:apply-templates mode="copy" /> + </caption> + </xsl:template> + + <xsl:template match="span[@class='fussnote']" mode="copy-with-notes"> + <note n1="x-studynote"> + <xsl:variable name="id"> + <xsl:value-of select="@data-param"/> + </xsl:variable> + <xsl:apply-templates select="//div[@id=$id]/div/p" mode="copy-with-notes"/> + </note> + </xsl:template> + + <xsl:template match="text()" mode="copy-with-notes"> + <xsl:if test="string-length(normalize-space(.)) > 0"> + <xsl:value-of select="." /> + </xsl:if> + </xsl:template> + + <xsl:template match="em" mode="copy-with-notes"> + <style fs="emphasize"> + <xsl:value-of select="." /> + </style> + </xsl:template> + + <xsl:template match="br" mode="copy-with-notes"> + <br art="x-nl" /> + </xsl:template> + +<!-- + <xsl:template match="*" mode="error"> + <xsl:choose> + <xsl:when test="@class='fussnote'"/> + <xsl:when test="@class='verse'"/> + <xsl:when test="@class='chapter'"/> + <xsl:when test="local-name()='em'"/> + <xsl:when test="local-name()='br'"/> + <xsl:otherwise> + <xsl:message terminate="yes"> + <xsl:value-of select="local-name()" /> + <xsl:value-of select="@class" /> + <xsl:apply-templates select="." 
mode="copy" /> + </xsl:message> + </xsl:otherwise> + </xsl:choose> + </xsl:template> +--> + <xsl:template match="p[span/@class='verse']"> + <vers> + <xsl:attribute name="vnumber"> + <xsl:value-of select="span[@class='verse']"/> + </xsl:attribute> + <xsl:apply-templates select="text()|span[@class='fussnote']|em|br" mode="copy-with-notes" /> + <!--<xsl:apply-templates select="*" mode="error" />--> + </vers> + </xsl:template> + + <xsl:template match="div[@class='annotation']"> + </xsl:template> + + + <xsl:template match="div[@class='markdown']"> + <xsl:apply-templates select="*" /> + </xsl:template> + + + <xsl:template match="/"> + <chapter cnumber="{//span[@class='chapter']}"> + <xsl:apply-templates select="//div[@class='markdown']"/> + </chapter> + </xsl:template> + + <xsl:template match="*" mode="copy"> + <xsl:element name="{local-name()}"> + <xsl:apply-templates mode="copy" select="@*|node()" /> + </xsl:element> + </xsl:template> + <xsl:template match="@*" mode="copy"> + <xsl:attribute name="{local-name()}"> + <xsl:value-of select="." 
/> + </xsl:attribute> + </xsl:template> +</xsl:stylesheet> diff --git a/download-books.sh b/download-books.sh new file mode 100755 index 0000000..125c29c --- /dev/null +++ b/download-books.sh @@ -0,0 +1,17 @@ +#!/bin/sh + +rm -rf books/ +mkdir books +echo -n > book-list + +curl -s http://www.die-bibel.de/online-bibeln/elberfelder-bibel/bibeltext/ | \ + ./parse-book.sed | \ + while read url + do + read book + book=$(echo $book | sed s:/.*$:: ) + echo $book + echo $book >> book-list + curl $url > "books/$book" + + done diff --git a/download-chapters.sh b/download-chapters.sh new file mode 100755 index 0000000..ba29d8e --- /dev/null +++ b/download-chapters.sh @@ -0,0 +1,17 @@ +#!/bin/sh + +mkdir -p chapters/ + +cat book-list | while read buch +do + mkdir -p "chapters/$buch" + ./parse-chapter.sed "books/$buch" | \ + while read url + do + read number + echo $url + + echo "$buch $number" + curl $url > "chapters/$buch/$number" + done +done diff --git a/download.sh b/download.sh new file mode 100755 index 0000000..0e80054 --- /dev/null +++ b/download.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +./download-books.sh +./download-chapters.sh diff --git a/gen-bible.sh b/gen-bible.sh new file mode 100755 index 0000000..c6e56df --- /dev/null +++ b/gen-bible.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +./make-xml.sh +./concat.sh > elberfelder2006.xml + +zip elberfelder2006.zip elberfelder2006.xml diff --git a/list-chapters.sh b/list-chapters.sh new file mode 100755 index 0000000..0964982 --- /dev/null +++ b/list-chapters.sh @@ -0,0 +1,11 @@ +#!/bin/sh + +cat book-list | while read buch +do + i=1 + while [ -e "chapters/$buch/$i" ] + do + echo "chapters/$buch/$i" + i=$((i+1)) + done +done diff --git a/make-xml.sh b/make-xml.sh new file mode 100755 index 0000000..dace25c --- /dev/null +++ b/make-xml.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +./list-chapters.sh | while read file +do + echo "$file" + ./convert.sed "$file" | \ + xsltproc --encoding utf-8 --html convert.xsl - > "${file}.xml" +done diff --git 
a/parse-book.sed b/parse-book.sed new file mode 100755 index 0000000..d46191c --- /dev/null +++ b/parse-book.sed @@ -0,0 +1,4 @@ +#!/bin/sed -nf + +/>&[gl]t;<\/a>/d +s:^.*href="\(.*lesen/stelle/[0-9][0-9]*///ch/[^"]*\)">\([^<]*\)</a>.*$:\1\n\2:p diff --git a/parse-chapter.sed b/parse-chapter.sed new file mode 100755 index 0000000..614d2cf --- /dev/null +++ b/parse-chapter.sed @@ -0,0 +1,4 @@ +#!/bin/sed -nf + +/>&[gl]t;<\/a>/d +s:^.*href="\(.*lesen/stelle/[0-9][0-9]*/[0-9][0-9]*/[0-9][0-9]*/ch/[^"]*\)">\([^<]*\)</a>.*$:\1\n\2:p |