Hi,
as I am rewriting some of my old Python 2 programs, I have updated
my vorleser.net download script. This time, I am using BeautifulSoup and
Python 3, which makes things a lot easier.
I also added a COMPARE_FOLDER so it will only download files which are not
in your collection already.
Vorleser.net
"""python program to download all audiobooks from vorleser.net"""
__author__ = "Frederik Lauber"
__copyright__ = "Copyright 2014"
__license__ = "GPL3"
__version__ = "0.5"
__maintainer__ = "Frederik Lauber"
__status__ = "beta"
from urllib.request import urlopen , urlretrieve
import os
from bs4 import BeautifulSoup
import re
def clean_folder_name(folder_name):
    """Replace characters that are invalid in folder names with spaces.

    Commas, colons, dots and question marks are each turned into a
    single space; leading/trailing whitespace is then stripped.
    """
    without_specials = re.sub(r'[,:\.\?]', ' ', folder_name)
    return without_specials.strip()
def url2soup(url):
    """Fetch *url* and return the parsed page as a BeautifulSoup object.

    The HTTP response is used as a context manager so the underlying
    socket is closed after reading (the original leaked it).  An
    explicit parser is named so bs4 does not emit its "no parser
    specified" warning and parses identically on every machine
    (NOTE(review): if lxml was previously installed and silently
    chosen, output could differ slightly — confirm).
    """
    with urlopen(url) as response:
        markup = response.read()
    return BeautifulSoup(markup, "html.parser")
# Site roots for scraping.
BASE_URL = "http://vorleser.net/"
AUTHOR_URL = "http://vorleser.net/alle_autoren.php"
# Where downloads land, and an existing collection used to skip
# already-owned werke.  The stray spaces inside the original literals
# ("C: \\ download_folder \\ ") were a paste/extraction artifact and
# would have produced folders with spaces embedded in their names.
DOWNLOAD_FOLDER = "C:\\download_folder\\"
COMPARE_FOLDER = "C:\\already_here_folder\\"
def get_autor_links():
    """Return a dict mapping cleaned author names to their author-page URLs.

    Scrapes AUTHOR_URL and keeps every anchor whose href points at an
    ``autor.php?id`` page; the link text (cleaned for filesystem use)
    becomes the key, the href the value.
    """
    author_page = url2soup(AUTHOR_URL)
    links = {}
    for anchor in author_page.find_all('a'):
        target = anchor.get("href")
        if "autor.php?id" in target:
            links[clean_folder_name(anchor.get_text())] = target
    return links
def get_werke_links(url):
    """Return a dict mapping werk names to their download-file paths.

    *url* is an author-page path relative to BASE_URL.  Every werk
    linked from the author's "dWerke" div is visited, and each download
    redirect link (``d.php?d=download.php&f=...``) found there is
    stripped down to the part after ``f=``.

    Fixes over the original: uses ``find_all`` (the modern name,
    consistent with get_autor_links) instead of the deprecated
    ``findAll``; replaces the magic slice ``href[23:]`` with
    ``len(prefix)``; and skips anchors without an href attribute, which
    previously crashed on ``None.startswith``.
    """
    download_prefix = "d.php?d=download.php&f="
    werke_link_dict = {}
    soup_werke = url2soup(BASE_URL + url)
    for div in soup_werke.find_all("div", {"id": "dWerke"}):
        for werk_list_url in div.find_all("a"):
            werk_name = werk_list_url.get_text()
            soup_download = url2soup(BASE_URL + werk_list_url.get("href"))
            for a_tag in soup_download.find_all("a"):
                href = a_tag.get("href")
                if href and href.startswith(download_prefix):
                    key = clean_folder_name(werk_name.strip())
                    werke_link_dict[key] = href[len(download_prefix):]
    return werke_link_dict
def parse_autor_name(autor_string):
    """Convert "Lastname, Firstname" into "Firstname Lastname".

    Uses ``str.partition`` instead of ``str.split`` so that a name
    without a comma (e.g. "Homer") is returned stripped-but-unchanged
    rather than raising ValueError, and a name containing more than one
    comma splits only at the first.
    """
    lastname, sep, firstname = autor_string.partition(",")
    if not sep:
        # no comma: nothing to reorder
        return autor_string.strip()
    return (firstname.strip() + " " + lastname.strip()).strip()
def download_if_not_existent(autor_name, werk_name, url):
    """Download *url* into DOWNLOAD_FOLDER/<autor>/<werk>/01<ext>.

    The werk is skipped entirely (with a message) when the same
    autor/werk path already exists under COMPARE_FOLDER — i.e. it is
    already in the user's collection.  Otherwise the folders are
    created as needed and the file is fetched unless it is already
    present.  The target name keeps only the last four characters of
    the URL, which assumes a three-letter extension such as ".mp3" —
    TODO confirm against actual download URLs.
    """
    autor_folder = os.path.join(DOWNLOAD_FOLDER, autor_name)
    werk_folder = os.path.join(autor_folder, werk_name)
    file_path = os.path.join(werk_folder, "01" + url[-4:])
    if os.path.exists(os.path.join(COMPARE_FOLDER, autor_name, werk_name)):
        print("\tAlready Exists")
        return
    # makedirs is recursive, so one call replaces the original's two
    # exists()+makedirs() pairs; exist_ok avoids the check/create race.
    os.makedirs(werk_folder, exist_ok=True)
    if not os.path.exists(file_path):
        urlretrieve(url, file_path)
if __name__ == "__main__":
    # Walk every author page, then every werk of that author, and
    # download anything not already present in the local collection.
    print("START")
    for raw_name, page_url in get_autor_links().items():
        display_name = parse_autor_name(raw_name)
        for title, file_url in get_werke_links(page_url).items():
            download_if_not_existent(display_name, title, file_url)
    print("STOP")