As I am rewriting some of my old Python 2 programs, I have updated my vorleser.net download script. This time, I am using BeautifulSoup and Python 3, which makes things a lot easier.

I also added a COMPARE_FOLDER so it will only download files that are not already in your collection.


"""python program to download all audiobooks from vorleser.net"""
__author__ = "Frederik Lauber"
__copyright__ = "Copyright 2014"
__license__ = "GPL3"
__version__ = "0.5"
__maintainer__ = "Frederik Lauber"
__status__ = "beta"
from urllib.request import urlopen, urlretrieve
import os
from bs4 import BeautifulSoup
import re

def clean_folder_name(folder_name):
	"""Make a string safe to use as a folder name.

	Each comma, colon, dot and question mark is replaced by a space,
	then leading/trailing whitespace is trimmed.
	"""
	sanitized = re.sub(r'[,:\.\?]', ' ', folder_name)
	return sanitized.strip()

def url2soup(url):
	"""Fetch *url* and return its body parsed into a BeautifulSoup tree.

	An explicit parser is passed because bs4 warns (and may pick a
	different parser on different machines) when none is given, and the
	HTTP response is closed deterministically via the context manager.
	"""
	with urlopen(url) as response:
		markup = response.read()
	return BeautifulSoup(markup, "html.parser")

# Site entry points and local folders used for syncing.
BASE_URL = "http://vorleser.net/"  # prefix for all relative links on the site
AUTHOR_URL = "http://vorleser.net/alle_autoren.php"  # page listing every author
DOWNLOAD_FOLDER = "C:\\download_folder\\"  # new files are saved below this path
COMPARE_FOLDER = "C:\\already_here_folder\\"  # existing collection; anything found here is skipped

def get_autor_links():
	"""Lookup: autor name to link to autor page.

	Scrapes AUTHOR_URL and collects every anchor pointing at an
	"autor.php?id" page.

	Returns:
		dict: cleaned author name -> relative author-page link.
	"""
	soup_autor = url2soup(AUTHOR_URL)
	autor_link_dict = dict()
	for link in soup_autor.find_all('a'):
		href = link.get("href")
		# Anchors without an href would make the substring test crash
		# with a TypeError, so guard first.
		if href and "autor.php?id" in href:
			# NOTE(review): the pasted original lost the right-hand side
			# of this assignment; storing the href is the evident intent.
			autor_link_dict[clean_folder_name(link.get_text())] = href
	return autor_link_dict

def get_werke_links(url):
	"""Lookup: werk name to download link of werk.

	Visits the author page at BASE_URL + url, follows each werk's detail
	page and extracts the target file path from its
	"d.php?d=download.php&f=" redirect link.

	Args:
		url: author-page link relative to BASE_URL.

	Returns:
		dict: cleaned werk name -> file path relative to BASE_URL.
	"""
	soup_werke = url2soup(BASE_URL + url)
	werke_link_dict = dict()
	# The list of works lives inside the div with id "dWerke".
	for div in soup_werke.findAll("div", {"id": "dWerke"}):
		for werk_list_url in div.findAll("a"):
			werk_name = werk_list_url.get_text()
			soup_download = url2soup(BASE_URL + werk_list_url.get("href"))
			for a_tag in soup_download.findAll("a"):
				href = a_tag.get("href")
				# href can be None for nameless anchors; startswith would raise.
				if href and href.startswith("d.php?d=download.php&f="):
					# Slice off the 23-char redirect prefix, keeping only
					# the real file path.  (The pasted original lost the
					# start of this line; this is the evident intent.)
					werke_link_dict[clean_folder_name(werk_name.strip())] = href[23:]
	return werke_link_dict

def parse_autor_name(autor_string):
	"""Change the name scheme "Lastname, Firstname" to "Firstname Lastname".

	Uses str.partition so that only the FIRST comma splits the name:
	the original split(",") raised ValueError both for single-word names
	(no comma) and for names containing more than one comma.
	A name without a comma is returned unchanged (stripped).
	"""
	lastname, _, firstname = autor_string.partition(",")
	return (firstname.strip() + " " + lastname.strip()).strip()

def download_if_not_existent(autor_name, werk_name, url):
	"""Download *url* into DOWNLOAD_FOLDER/autor_name/werk_name/01.<ext>.

	The download is skipped when the werk already exists in
	COMPARE_FOLDER (the existing collection) or when the target file is
	already present; missing author/werk folders are created on demand.

	Args:
		autor_name: folder-safe author name.
		werk_name: folder-safe audiobook title.
		url: download URL; its last four characters are taken as the
			file extension (assumes e.g. ".mp3" — TODO confirm).
	"""
	autor_folder = os.path.join(DOWNLOAD_FOLDER, autor_name)
	werk_folder = os.path.join(autor_folder, werk_name)
	# The single file per werk is named "01" + extension from the URL tail.
	file_path = os.path.join(werk_folder, "01" + url[-4:])
	if os.path.exists(os.path.join(COMPARE_FOLDER, autor_name, werk_name)):
		# Already part of the local collection - nothing to do.
		return
	# NOTE(review): the pasted original lost the bodies of the two
	# folder-creation ifs; makedirs(exist_ok=True) covers both levels.
	os.makedirs(werk_folder, exist_ok=True)
	if os.path.exists(file_path):
		print("\tAlready Exists")
	else:
		urlretrieve(url, file_path)

if __name__ == "__main__":
	# Walk every author page, then every werk of that author, and fetch
	# whatever is not yet present locally.
	for raw_name, page_url in get_autor_links().items():
		display_name = parse_autor_name(raw_name)
		for title, download_url in get_werke_links(page_url).items():
			download_if_not_existent(display_name, title, download_url)