summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBenjamin Franzke <benjaminfranzke@googlemail.com>2012-06-28 10:09:28 +0200
committerBenjamin Franzke <benjaminfranzke@googlemail.com>2012-06-28 10:09:28 +0200
commit36c0f2f4d8aef72776a337eef05bec8cd0360e83 (patch)
tree0751342b278da2a5a67226b7450f046b50b279de
downloadbible-fetch-36c0f2f4d8aef72776a337eef05bec8cd0360e83.tar.gz
bible-fetch-36c0f2f4d8aef72776a337eef05bec8cd0360e83.tar.bz2
bible-fetch-36c0f2f4d8aef72776a337eef05bec8cd0360e83.zip
Add scripts to download elberfelder from die-bibel.de
That is: downloading is done by shell scripts using curl, with books and chapters parsed by sed. The HTML is then prepared with sed so that it can be converted to Zefania XML using an XSL stylesheet.
-rw-r--r--.gitignore5
-rwxr-xr-xconcat.sh24
-rwxr-xr-xconvert.sed31
-rw-r--r--convert.xsl88
-rwxr-xr-xdownload-books.sh17
-rwxr-xr-xdownload-chapters.sh17
-rwxr-xr-xdownload.sh4
-rwxr-xr-xgen-bible.sh6
-rwxr-xr-xlist-chapters.sh11
-rwxr-xr-xmake-xml.sh8
-rwxr-xr-xparse-book.sed4
-rwxr-xr-xparse-chapter.sed4
12 files changed, 219 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..cf7201c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+books/
+chapters/
+book-list
+elberfelder2006.zip
+elberfelder2006.xml
diff --git a/concat.sh b/concat.sh
new file mode 100755
index 0000000..e9b203e
--- /dev/null
+++ b/concat.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+echo '<?xml version="1.0" encoding="utf-8"?>'
+echo '<xmlbible type="x-bible" biblename="Elberfelder 2006" status="v">'
+echo '<information>'
+echo '<title>Elberfelder 2006</title>'
+echo '<format>Zefania XML Bible Markup Language</format>'
+echo '</information>'
+j=1
+cat book-list | while read buch
+do
+ echo "<biblebook bname=\"$buch\" bnumber=\"$j\">"
+
+ i=1
+ while [ -e "chapters/$buch/$i" ]
+ do
+ cat "chapters/$buch/$i.xml" | sed 1d
+ i=$((i+1))
+ done
+
+ j=$((j+1))
+ echo "</biblebook>"
+done
+echo '</xmlbible>'
diff --git a/convert.sed b/convert.sed
new file mode 100755
index 0000000..2449d1f
--- /dev/null
+++ b/convert.sed
@@ -0,0 +1,31 @@
+#!/bin/sed -f
+
+# Preprocess a downloaded chapter page so that "xsltproc --html" can parse it.
+
+# On lines containing "data-href", turn every "&" into "&amp;".
+# (In the replacement, the leading "&" stands for the matched text, i.e. "&".)
+/data-href/s/&/&amp;/g
+
+# xsltproc --html doesn't understand HTML5, so rename the HTML5 element names.
+# These are plain text substitutions: every occurrence on a line is replaced.
+s/section/div/g
+s/header/h1/g
+s/<nav/<div/g
+s:</nav:</div:g
+s/footer/div/g
+s/article/div/g
+
+# Fix incorrect < and > inside p tags by allowing only known tags
+# to be surrounded by < and >.
+#
+# "ta" jumps to the label right below it; its only effect is to reset the
+# "a substitution succeeded" flag, so that "tfix" reacts solely to the
+# <p>...</p> substitution and not to the renames above.
+ta
+:a
+# Strip the <p>...</p> wrapper (first "<p>" to last "</p>" on the line).
+s/<p>\(.*\)<\/p>/\1/
+# A line that had a <p>...</p> wrapper gets fixed up; any other line ends
+# the script here ("b" without a label) and is printed as it is.
+tfix
+b
+
+:fix
+# Escape every angle bracket ...
+s/</\&lt;/g
+s/>/\&gt;/g
+
+# ... then restore the known tags: <span ...> (attributes without ";") and </span>,
+s/&lt;span\([^;]*\)&gt;/<span\1>/g
+s/&lt;\/span&gt;/<\/span>/g
+
+# ... and <em> / </em>.
+# NOTE(review): only span and em are restored; any other tag inside <p>
+# (e.g. br) stays escaped - confirm that this is intended.
+s/\&lt;em\&gt;/<em>/g
+s/\&lt;\/em\&gt;/<\/em>/g
+
+# Wrap the whole line in <p>...</p> again.
+s:.*:<p>&</p>:
diff --git a/convert.xsl b/convert.xsl
new file mode 100644
index 0000000..d8f1d77
--- /dev/null
+++ b/convert.xsl
@@ -0,0 +1,88 @@
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
+ <!-- Converts one preprocessed chapter page into a single <chapter>
+      element; see the match="/" template for the entry point. -->
+ <xsl:output method="xml" indent="yes" encoding="UTF-8" media-type="text/xml"/>
+
+ <!-- A heading becomes a <caption>. vref is the text of the first
+      span class="verse" that follows the heading in the document; the
+      heading's content is reproduced through the "copy" mode templates. -->
+ <xsl:template match="h2">
+ <caption vref="{following::span[@class='verse']}">
+ <xsl:apply-templates mode="copy" />
+ </caption>
+ </xsl:template>
+
+ <!-- A footnote marker becomes a <note>. Its data-param attribute is used
+      as the id of a div elsewhere in the document; the p elements found
+      under that div (div/p) supply the note content. -->
+ <xsl:template match="span[@class='fussnote']" mode="copy-with-notes">
+ <note n1="x-studynote">
+ <xsl:variable name="id">
+ <xsl:value-of select="@data-param"/>
+ </xsl:variable>
+ <xsl:apply-templates select="//div[@id=$id]/div/p" mode="copy-with-notes"/>
+ </note>
+ </xsl:template>
+
+ <!-- Text is copied unless it consists of whitespace only. -->
+ <xsl:template match="text()" mode="copy-with-notes">
+ <xsl:if test="string-length(normalize-space(.)) > 0">
+ <xsl:value-of select="." />
+ </xsl:if>
+ </xsl:template>
+
+ <!-- Emphasis becomes <style fs="emphasize"> holding the element's text. -->
+ <xsl:template match="em" mode="copy-with-notes">
+ <style fs="emphasize">
+ <xsl:value-of select="." />
+ </style>
+ </xsl:template>
+
+ <!-- A line break becomes <br art="x-nl"/>. -->
+ <xsl:template match="br" mode="copy-with-notes">
+ <br art="x-nl" />
+ </xsl:template>
+
+ <!-- Disabled debugging aid below: it would abort the transformation with
+      a message for any verse child that is not a footnote, verse or chapter
+      span, an em or a br. -->
+<!--
+ <xsl:template match="*" mode="error">
+ <xsl:choose>
+ <xsl:when test="@class='fussnote'"/>
+ <xsl:when test="@class='verse'"/>
+ <xsl:when test="@class='chapter'"/>
+ <xsl:when test="local-name()='em'"/>
+ <xsl:when test="local-name()='br'"/>
+ <xsl:otherwise>
+ <xsl:message terminate="yes">
+ <xsl:value-of select="local-name()" />
+ <xsl:value-of select="@class" />
+ <xsl:apply-templates select="." mode="copy" />
+ </xsl:message>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+-->
+ <!-- A paragraph with a span class="verse" child becomes a <vers>. vnumber
+      is the text of that span; only direct text, footnote spans, em and br
+      children contribute to the verse content. -->
+ <xsl:template match="p[span/@class='verse']">
+ <vers>
+ <xsl:attribute name="vnumber">
+ <xsl:value-of select="span[@class='verse']"/>
+ </xsl:attribute>
+ <xsl:apply-templates select="text()|span[@class='fussnote']|em|br" mode="copy-with-notes" />
+ <!--<xsl:apply-templates select="*" mode="error" />-->
+ </vers>
+ </xsl:template>
+
+ <!-- Annotation blocks produce no output where they occur. -->
+ <xsl:template match="div[@class='annotation']">
+ </xsl:template>
+
+
+ <!-- The content container: process each of its element children. -->
+ <xsl:template match="div[@class='markdown']">
+ <xsl:apply-templates select="*" />
+ </xsl:template>
+
+
+ <!-- Entry point: one <chapter> per document. cnumber is the text of the
+      first span class="chapter"; the body comes from every
+      div class="markdown" in the document. -->
+ <xsl:template match="/">
+ <chapter cnumber="{//span[@class='chapter']}">
+ <xsl:apply-templates select="//div[@class='markdown']"/>
+ </chapter>
+ </xsl:template>
+
+ <!-- "copy" mode: rebuild elements and attributes under their local names. -->
+ <xsl:template match="*" mode="copy">
+ <xsl:element name="{local-name()}">
+ <xsl:apply-templates mode="copy" select="@*|node()" />
+ </xsl:element>
+ </xsl:template>
+ <xsl:template match="@*" mode="copy">
+ <xsl:attribute name="{local-name()}">
+ <xsl:value-of select="." />
+ </xsl:attribute>
+ </xsl:template>
+</xsl:stylesheet>
diff --git a/download-books.sh b/download-books.sh
new file mode 100755
index 0000000..125c29c
--- /dev/null
+++ b/download-books.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+rm -rf books/
+mkdir books
+echo -n > book-list
+
+curl -s http://www.die-bibel.de/online-bibeln/elberfelder-bibel/bibeltext/ | \
+ ./parse-book.sed | \
+ while read url
+ do
+ read book
+ book=$(echo $book | sed s:/.*$:: )
+ echo $book
+ echo $book >> book-list
+ curl $url > "books/$book"
+
+ done
diff --git a/download-chapters.sh b/download-chapters.sh
new file mode 100755
index 0000000..ba29d8e
--- /dev/null
+++ b/download-chapters.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+#
+# For every book listed in ./book-list, extract the chapter links from
+# books/<book> and download each chapter page to chapters/<book>/<number>.
+
+mkdir -p chapters/
+
+# read -r keeps backslashes literal; the list is fed by redirection
+# instead of an extra cat process.
+while read -r buch
+do
+  mkdir -p "chapters/$buch"
+  # parse-chapter.sed prints two lines per chapter: the URL, then the link
+  # text, which is used as the chapter number.
+  ./parse-chapter.sed "books/$buch" | \
+    while read -r url
+    do
+      read -r number
+      printf '%s\n' "$url"
+
+      printf '%s\n' "$buch $number"
+      # Quote the URL so that characters such as '?' or '*' reach curl
+      # verbatim instead of being globbed or split.
+      curl "$url" > "chapters/$buch/$number"
+    done
+done < book-list
diff --git a/download.sh b/download.sh
new file mode 100755
index 0000000..0e80054
--- /dev/null
+++ b/download.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+# Run both download stages in order: the book pages first, then the
+# chapter pages that are derived from them.
+for stage in books chapters
+do
+  "./download-$stage.sh"
+done
diff --git a/gen-bible.sh b/gen-bible.sh
new file mode 100755
index 0000000..c6e56df
--- /dev/null
+++ b/gen-bible.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+# Base name shared by the generated XML document and its zip archive.
+name=elberfelder2006
+
+# Convert every chapter, join the results into one document, then pack it.
+./make-xml.sh
+./concat.sh > "$name.xml"
+
+zip "$name.zip" "$name.xml"
diff --git a/list-chapters.sh b/list-chapters.sh
new file mode 100755
index 0000000..0964982
--- /dev/null
+++ b/list-chapters.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+#
+# Print the path of every downloaded chapter file, one per line, in
+# book-list order. Chapters are numbered from 1 and listing for a book
+# stops at the first number with no file.
+
+# read -r keeps backslashes in a book name literal; the list is fed by
+# redirection instead of an extra cat process.
+while read -r buch
+do
+  i=1
+  while [ -e "chapters/$buch/$i" ]
+  do
+    printf '%s\n' "chapters/$buch/$i"
+    i=$((i+1))
+  done
+done < book-list
diff --git a/make-xml.sh b/make-xml.sh
new file mode 100755
index 0000000..dace25c
--- /dev/null
+++ b/make-xml.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+./list-chapters.sh | while read file
+do
+ echo "$file"
+ ./convert.sed "$file" | \
+ xsltproc --encoding utf-8 --html convert.xsl - > "${file}.xml"
+done
diff --git a/parse-book.sed b/parse-book.sed
new file mode 100755
index 0000000..d46191c
--- /dev/null
+++ b/parse-book.sed
@@ -0,0 +1,4 @@
+#!/bin/sed -nf
+
+# Extract book links from the index page. Because of -n, only lines on
+# which the "s" command below succeeds are printed.
+
+# Skip lines whose link text is just "&gt;" or "&lt;".
+/>&[gl]t;<\/a>/d
+# For an anchor whose href contains "lesen/stelle/<digits>///ch/", print the
+# href and the link text on two separate lines ("\n" in the replacement).
+s:^.*href="\(.*lesen/stelle/[0-9][0-9]*///ch/[^"]*\)">\([^<]*\)</a>.*$:\1\n\2:p
diff --git a/parse-chapter.sed b/parse-chapter.sed
new file mode 100755
index 0000000..614d2cf
--- /dev/null
+++ b/parse-chapter.sed
@@ -0,0 +1,4 @@
+#!/bin/sed -nf
+
+# Extract chapter links from a book page. Because of -n, only lines on
+# which the "s" command below succeeds are printed.
+
+# Skip lines whose link text is just "&gt;" or "&lt;".
+/>&[gl]t;<\/a>/d
+# For an anchor whose href contains
+# "lesen/stelle/<digits>/<digits>/<digits>/ch/", print the href and the link
+# text on two separate lines ("\n" in the replacement).
+s:^.*href="\(.*lesen/stelle/[0-9][0-9]*/[0-9][0-9]*/[0-9][0-9]*/ch/[^"]*\)">\([^<]*\)</a>.*$:\1\n\2:p