Hi,
As I am rewriting some of my old Python 2 programs, I have updated my vorleser.net download script. This time I am using Beautiful Soup and Python 3, which makes it a lot easier.
I also added a COMPARE_FOLDER setting, so the script only downloads files that are not already in your collection.
Vorleser.net
"""Python program to download all audiobooks from vorleser.net.

When run as a script it collects the author links from AUTHOR_URL, looks up
the download link of every work of each author, and stores each download
below DOWNLOAD_FOLDER. Works that already have a folder below COMPARE_FOLDER
are skipped.
"""
__author__ = "Frederik Lauber"
__copyright__ = "Copyright 2014"
__license__ = "GPL3"
__version__ = "0.5"
__maintainer__ = "Frederik Lauber"
__status__ = "beta"
from urllib.request import urlopen, urlretrieve
import os
from bs4 import BeautifulSoup
import re
def clean_folder_name(folder_name):
    """Return *folder_name* with ',', ':', '.' and '?' replaced by spaces.

    Every occurrence of one of these symbols becomes a single space, and
    leading/trailing whitespace is stripped from the result afterwards.
    """
    invalid_symbols = re.compile(r'[,:\.\?]')
    sanitized = invalid_symbols.sub(' ', folder_name)
    return sanitized.strip()
def url2soup(url):
    """Fetch *url* and return its content parsed as a BeautifulSoup object.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        A ``BeautifulSoup`` tree built from the response body.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
    """
    # The context manager closes the HTTP response even when read() fails,
    # so no connection is leaked while crawling many pages.
    with urlopen(url) as response:
        markup = response.read()
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between machines.
    return BeautifulSoup(markup, "html.parser")
# Site root; relative links scraped from the pages are appended to it.
BASE_URL = "http://vorleser.net/"
# Page that get_autor_links() scans for links to the author pages.
AUTHOR_URL = BASE_URL + "alle_autoren.php"
# Root folder below which new downloads are stored.
DOWNLOAD_FOLDER = "C:\\download_folder\\"
# Existing collection; works that already have a folder here are skipped.
COMPARE_FOLDER = "C:\\already_here_folder\\"
def get_autor_links():
    """Build a lookup from author name to the link of the author's page.

    Every ``<a>`` on AUTHOR_URL whose href contains ``autor.php?id`` is
    collected. The link text is used as the key; it is cleaned with
    clean_folder_name() one comma-separated part at a time, so the comma of
    a "Lastname, Firstname" entry is kept and the key can still be handed
    to parse_autor_name().

    Returns:
        dict mapping the cleaned author name to the href of the author page
        (the href is returned exactly as found in the page).
    """
    soup_autor = url2soup(AUTHOR_URL)
    autor_link_dict = {}
    # href=True skips anchors without an href attribute; link.get("href")
    # would return None for them and make the substring test raise TypeError.
    for link in soup_autor.find_all("a", href=True):
        href = link["href"]
        if "autor.php?id" not in href:
            continue
        # Cleaning the whole text at once would replace the comma as well,
        # and parse_autor_name() could then no longer split the name.
        name_parts = link.get_text().split(",")
        autor_name = ", ".join(clean_folder_name(part) for part in name_parts)
        autor_link_dict[autor_name] = href
    return autor_link_dict
def get_werke_links(url):
    """Build a lookup from work name to the download link of that work.

    The author page at ``BASE_URL + url`` is fetched; every link inside a
    ``<div id="dWerke">`` is followed, and the page behind it is searched
    for anchors whose href starts with ``d.php?d=download.php&f=``. The
    part of the href after that prefix is stored as the download link.

    Args:
        url: Link of the author page, relative to BASE_URL.

    Returns:
        dict mapping the cleaned work name to its download link. If a work
        page contains several matching anchors, the last one wins.
    """
    download_prefix = "d.php?d=download.php&f="
    werke_link_dict = {}
    soup_werke = url2soup(BASE_URL + url)
    for div in soup_werke.find_all("div", {"id": "dWerke"}):
        # href=True skips anchors without an href; concatenating None to
        # BASE_URL would raise TypeError.
        for werk_link in div.find_all("a", href=True):
            werk_name = clean_folder_name(werk_link.get_text().strip())
            soup_download = url2soup(BASE_URL + werk_link["href"])
            for a_tag in soup_download.find_all("a", href=True):
                href = a_tag["href"]
                if href.startswith(download_prefix):
                    # Slice by the prefix length instead of a hard-coded
                    # offset so the two cannot drift apart.
                    werke_link_dict[werk_name] = href[len(download_prefix):]
    return werke_link_dict
def parse_autor_name(autor_string):
    """Change the name scheme "Lastname, Firstname" to "Firstname Lastname".

    The string is split at the first comma only, so additional commas stay
    in the first-name part. A string without any comma cannot be reordered
    and is returned with surrounding whitespace stripped instead of raising
    ValueError.

    Args:
        autor_string: Author name, usually in the form "Lastname, Firstname".

    Returns:
        The reordered name with surrounding whitespace removed.
    """
    lastname, separator, firstname = autor_string.partition(",")
    if not separator:
        # No comma: nothing to reorder.
        return autor_string.strip()
    # The outer strip() covers an empty first or last name ("Kafka,").
    return (firstname.strip() + " " + lastname.strip()).strip()
def download_if_not_existent(autor_name, werk_name, url):
    """Download *url* into the folder of the given author and work.

    Nothing is downloaded when COMPARE_FOLDER already contains a folder for
    this author/work pair ("Already Exists" is printed instead) or when the
    target file is already present below DOWNLOAD_FOLDER. Missing author
    and work folders are created.

    Args:
        autor_name: Folder name used for the author.
        werk_name: Folder name used for the work.
        url: Download URL. Its last four characters are appended to "01" to
            form the file name (assumes a three-letter extension such as
            ".mp3" - TODO confirm against the site).
    """
    if os.path.exists(os.path.join(COMPARE_FOLDER, autor_name, werk_name)):
        print("\tAlready Exists")
        return

    werk_folder = os.path.join(DOWNLOAD_FOLDER, autor_name, werk_name)
    file_path = os.path.join(werk_folder, "01" + url[-4:])
    # makedirs creates the author folder as an intermediate directory;
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(werk_folder, exist_ok=True)
    if os.path.exists(file_path):
        return

    # Download under a temporary name and rename afterwards, so an
    # interrupted transfer never leaves a truncated file that a later run
    # would mistake for a finished download.
    partial_path = file_path + ".part"
    urlretrieve(url, partial_path)
    os.replace(partial_path, file_path)
if __name__ == "__main__":
    # Walk every author, then every work of that author, and fetch whatever
    # is not available locally yet.
    print("START")
    author_pages = get_autor_links()
    for listed_name in author_pages:
        folder_name = parse_autor_name(listed_name)
        works = get_werke_links(author_pages[listed_name])
        for title in works:
            download_if_not_existent(folder_name, title, works[title])
    print("STOP")