Hi,

As I am rewriting some of my old Python 2 programs, I have updated my vorleser.net download script. This time, I am using BeautifulSoup and Python 3, which makes things a lot easier.

I also added a COMPARE_FOLDER, so the script only downloads files that are not already in your collection.
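The comparison itself is just a path check against the existing collection. A minimal sketch of the idea (the folder names below are only placeholders; the real logic lives in download_if_not_existent further down):

import os

COMPARE_FOLDER = "C:\\already_here_folder\\"  # placeholder path to the existing collection

def already_in_collection(autor_name, werk_name):
    """True if an autor/werk folder already exists in the collection."""
    return os.path.exists(os.path.join(COMPARE_FOLDER, autor_name, werk_name))

# Example: only fetch the werk if it is not part of the collection yet
if not already_in_collection("Heinrich Heine", "Buch der Lieder"):
    print("new werk, would be downloaded")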

Vorleser.net

"""python program to download all audiobooks from vorleser.net"""
__author__ = "Frederik Lauber"
__copyright__ = "Copyright 2014"
__license__ = "GPL3"
__version__ = "0.5"
__maintainer__ = "Frederik Lauber"
__status__ = "beta"
from urllib.request import urlopen, urlretrieve
import os
from bs4 import BeautifulSoup
import re

def clean_folder_name(folder_name):
    """removes invalid symbols from a path and replaces them with whitespaces"""
    return re.sub(r'[,:\.\?]', ' ', folder_name).strip()

def url2soup(url):
    """helper to directly get a soup object"""
    with urlopen(url) as response:
        return BeautifulSoup(response.read(), "html.parser")

BASE_URL = "http://vorleser.net/"
AUTHOR_URL = "http://vorleser.net/alle_autoren.php"
DOWNLOAD_FOLDER = "C:\\download_folder\\"
COMPARE_FOLDER = "C:\\already_here_folder\\"

def get_autor_links():
    """Lookup: autor name to link to autor page"""
    soup_autor = url2soup(AUTHOR_URL)
    autor_link_dict = dict()
    for link in soup_autor.find_all('a'):
        if "autor.php?id" in link.get("href"):
            autor_link_dict[clean_folder_name(link.get_text())] = \
                link.get("href")
    return autor_link_dict

def get_werke_links(url):
    """Lookup: werk name to download link of werk"""
    soup_werke = url2soup(BASE_URL + url)
    werke_link_dict = dict()
    download_prefix = "d.php?d=download.php&f="
    for div in soup_werke.find_all("div", id="dWerke"):
        for werk_list_url in div.find_all("a"):
            werk_name = werk_list_url.get_text()
            soup_download = url2soup(BASE_URL + werk_list_url.get("href"))
            for a_tag in soup_download.find_all("a"):
                href = a_tag.get("href")
                # keep only the real download links and strip the redirect prefix
                if href and href.startswith(download_prefix):
                    werke_link_dict[clean_folder_name(werk_name)] = \
                        href[len(download_prefix):]
    return werke_link_dict

def parse_autor_name(autor_string):
    """changes name scheme "Lastname, Firstname" to "Firstname Lastname" """
    if "," not in autor_string:
        return autor_string.strip()
    (lastname, firstname) = autor_string.split(",", 1)
    return (firstname.strip() + " " + lastname.strip()).strip()

def download_if_not_existent(autor_name, werk_name, url):
    """Creates folders for autor and werk if they do not exist yet and
    downloads the given url into the werk folder unless the werk is
    already present in the COMPARE_FOLDER or the file itself exists."""
    autor_folder = os.path.join(DOWNLOAD_FOLDER, autor_name)
    werk_folder = os.path.join(autor_folder, werk_name)
    file_path = os.path.join(werk_folder, "01" + url[-4:])
    if not os.path.exists(os.path.join(COMPARE_FOLDER, autor_name, werk_name)):
        # makedirs also creates the autor folder as an intermediate directory
        os.makedirs(werk_folder, exist_ok=True)
        if not os.path.exists(file_path):
            urlretrieve(url, file_path)
        else:
            print("\tAlready Exists")

if __name__ == "__main__":
    print("START")
    for autor, autor_url in get_autor_links().items():
        autor_name = parse_autor_name(autor)
        for werk, werk_url in get_werke_links(autor_url).items():
            download_if_not_existent(autor_name, werk, werk_url)
    print("STOP")
