Source code for webdavclient.webdavclient

import os
from loguru import logger
import urllib.parse
from xml.etree import ElementTree
import requests
from requests.auth import HTTPBasicAuth


[docs] class WebDavClient: """ A simple WebDav client for downloading files and complete folder hierarchies from a remote host (f.e. cloud provider). Attributes: hostname (str) : full address to webdav host username (str) : username of connection password (str) : most properly a token generated for AUTH Methods: download_all_files_iterative(a,b): Downloads all files from a and stroes them under b locally. subtract(a, b): Returns the difference between a and b. """ def __init__(self, hostname: str, username: str, password: str) -> None: self.hostname: str = hostname self.username: str = username self.password: str = password self.auth: HTTPBasicAuth = HTTPBasicAuth( username, password, ) self.ensure_folder_exists("logs") logger.add("logs/downloadMd_s.log", rotation="500 MB")
[docs] def list_files(self, url: str) -> list[str]: """ This method list all files from your WebDav host that stay under the url Args: url (str) : web dev host url. Returns: type: list[str] list of all files who stay under the url (no recursion) Raises: OSError Examples: Example usage of the method: >>> hostname = "https://yourhost.de/webdav" >>> username = "Schlawiner23" >>> password = "YOUR_PERSONAL_TOKEN" >>> remote_base_path = "MyProjectFolder" >>> auth: HTTPBasicAuth = HTTPBasicAuth(username, password) >>> webdevclient = WebDavClient(hostname, username, password) >>> files = webdevclient.list_files(os.path.join(hostname, remote_base_path)) """ headers: dict[str, str] = {"Content-Type": "application/xml", "Depth": "1"} response: requests.Response = requests.request( "PROPFIND", url, auth=self.auth, headers=headers ) if response.status_code != 207: error_message: str = "Failed to list directory contents via requests: " logger.error( "Error message {} with code {}", error_message, response.status_code ) raise OSError(f"{error_message}{response.status_code}") tree = ElementTree.fromstring(response.content) files = [] for response in tree.findall("{DAV:}response"): href = response.find("{DAV:}href").text if not href.endswith("/"): logger.debug("Found file: {} for the following url: {}", href, url) files.append(href) return files
[docs] def list_folders(self, remote_base_path: str) -> list[str]: """ This method list all folders from your WebDav host that stay EXACTLY under the remote_base_path. No subfolders are considered. Args: remote_base_path (str) : Folder on host for which the folders should be listed Returns: type: list[str] list of all folders who stay under the parent folder Raises: OSError Examples: Example usage of the method: >>> hostname = "https://yourhost.de/webdav" >>> username = "Schlawiner23" >>> password = "YOUR_PERSONAL_TOKEN" >>> remote_base_path = "MyProjectFolder" >>> webdevclient = WebDavClient(args.hostname, args.username, args.password) >>> webdevclient.list_folders(remote_base_path) """ headers = {"Content-Type": "application/xml", "Depth": "1"} url: str = os.path.join(self.hostname, remote_base_path) # as we communicate we do not want WINDWOS \ as os.sep! url = url.replace(os.sep, "/") response = requests.request("PROPFIND", url, auth=self.auth, headers=headers) if response.status_code != 207: error_message = "Failed to list directory contents: " logger.error("{} {}", error_message, response.status_code) raise OSError(f"{error_message}{response.status_code}") tree = ElementTree.fromstring(response.content) folders = [] for response in tree.findall("{DAV:}response"): href = response.find("{DAV:}href").text folder = os.path.basename(os.path.normpath(href)) is_folder = href.endswith("/") is_not_remote_base_path = ( href != url + "/" ) and folder != remote_base_path.split("/")[-1] is_hidden_folder = folder.startswith(".") if is_folder and is_not_remote_base_path and not is_hidden_folder: logger.debug( "Found folder: {} in the parent folder: {}", folder, remote_base_path, ) folders.append(folder) return folders
[docs] def filter_after_global_base_path(self, path: str, remote_base_path: str) -> str: """ This method removes the hostname and the remote_base_path from an path Args: path (str) : Url to host, e.g. https://host.org remote_base_path (str): single folder on remote host e.g. data Returns: type: str Raises: None directly Example: host-url= https://host.org/ remote_base_path = data path = https://host.org/data/example1 "example1" ist returned """ search_str = "/" + remote_base_path + "/" if search_str in path: logger.debug( "Found folder {} for path {} and remote base path: {}", search_str, path, remote_base_path, ) url_after_removing_everything_before_and_including_remote_base_name = ( path.split(search_str, 1)[1] ) logger.info( "Folder structure everything after the remote_base_path is: {}", url_after_removing_everything_before_and_including_remote_base_name, ) return url_after_removing_everything_before_and_including_remote_base_name logger.error("Could not find search string: {} in path: {}", search_str, path) return path
[docs] def ensure_folder_exists(self, path: str) -> None: """ This method ensures that the given folder will exist Args: path (str) : Path to folder. Returns: type: None Raises: None directly """ if not os.path.exists(path): os.makedirs(path) logger.debug("Folder created: {}", path) else: logger.debug("Folder already exists: {}", path)
[docs] def get_sub_path(self, full_path: str, initial_part: str) -> str: """ Returns the sub-path after the initial part of the path. Args: full_path (str): The full path string. Does NOT have host url within initial_part (str): The initial part of the path string to be removed. Returns: type: str: The sub-path string after the initial part. Example 1: full_path = /data/subfolder1/text.txt initial_part (str) = data returns ==> subfolder1/text.txt Raises: ValueError """ # Decode URL-encoded parts of the path logger.debug("We are in the 'get_sub_path' method.") decoded_full_path = urllib.parse.unquote(full_path) logger.debug("The decoded full file path is: {}", decoded_full_path) decoded_initial_part = urllib.parse.unquote(initial_part) logger.debug("The decoded initial file path is: {}", decoded_initial_part) # Ensure the initial part ends with a slash # removes weird edge cases for later processing if not decoded_initial_part.endswith("/"): decoded_initial_part += "/" # Find the position where the initial part ends in the full path start_idx = decoded_full_path.find(decoded_initial_part) if start_idx == -1: logger.error( "The {} string is not in the full_path={}", initial_part, full_path ) raise ValueError("The full path does not contain the initial part.") # Handle the edge case where the full path is exactly the initial part decoded_initial_part = "/" + decoded_initial_part if full_path == decoded_initial_part.rstrip("/"): logger.debug("The get_sub_path() method returns empty string") return "" # Remove the initial part from the full path if full_path.startswith(initial_part): logger.debug( "The get_sub_path() method returns {}", full_path[len(initial_part) :] ) return full_path[len(initial_part) :] else: # Calculate the start index of the sub-path sub_path_start_idx = start_idx + len(decoded_initial_part) - 1 # Extract the sub-path sub_path = decoded_full_path[sub_path_start_idx:] logger.debug("The get_sub_path() method returns {}", sub_path) return sub_path
[docs] def download_files( self, global_remote_base_path: str, remote_base_path: str, local_base_path: str, ) -> None: """ This method downloads all files from your WebDav host that stay EXACTLY under the remote_base_path. No subfolders are considered. Args: remote_base_path (str): Folder on host which should be primary source for downloading files local_base_path (str) : all files (with preserved folder structures) are put inside this local path Returns: type: None Raises: None directly Examples: Example usage of the method: >>> hostname = "https://yourhost.de/webdav" >>> username = "Schlawiner23" >>> password = "YOUR_PERSONAL_TOKEN" >>> remote_base_path = "MyProjectFolder" >>> local_base_path = "assets" >>> auth: HTTPBasicAuth = HTTPBasicAuth(username, password) >>> download_files(hostname, auth, current_remote_path, local_base_path) """ if not os.path.exists(local_base_path): logger.info("Dir {} will be created", local_base_path) os.makedirs(local_base_path) url = os.path.join(self.hostname, remote_base_path) # as we communicate we do not want WINDWOS \ as os.sep! url = url.replace(os.sep, "/") files_on_host = self.list_files(url) if len(files_on_host) == 0: logger.info("Found no files on remote_base_path: {}", remote_base_path) for file_path in files_on_host: logger.info("Found the file: {} on current remote_base_path", file_path) file_name = self.filter_after_global_base_path(file_path, remote_base_path) logger.info("The pure of filename of this file is: {}", file_name) # Decoding the URL-encoded string decoded_filename = urllib.parse.unquote(file_name) logger.info("The decoded filename version is: {}", decoded_filename) remote_file_url = os.path.join( self.hostname, remote_base_path, file_name # file_path.split("/")[-1] ) remote_file_url = remote_file_url.replace(os.sep, "/") logger.info("The remote file url is: {}", remote_file_url) sub_path = self.get_sub_path(file_path, global_remote_base_path) if sub_path.endswith(decoded_filename): sub_path = sub_path[: len(sub_path) - len(decoded_filename)] if sub_path == decoded_filename: sub_path = "" logger.debug("The current sub path is: {}", sub_path) local_file_path = os.path.join(local_base_path, sub_path, decoded_filename) local_file_path = local_file_path.replace(os.sep, "/") logger.debug( "The current file that is stored has the full path: {}", local_file_path ) response = requests.get(remote_file_url, auth=self.auth, stream=True) if response.status_code == 200: folder_path = os.path.dirname(local_file_path) self.ensure_folder_exists(folder_path) with open(local_file_path, "wb") as f: for chunk in response.iter_content(chunk_size=8192): f.write(chunk) else: logger.debug( "Failed to download {}: {}", remote_file_url, response.status_code )
[docs] def download_all_files_iterative( self, remote_base_path: str, local_base_path: str, ) -> None: """ This method downloads all files from your WebDav host that stay under the remote_base_path. All subfolders will also be downloaded and folder structure is preserved Args: remote_base_path (str): Folder on host which should be primary source for downloading files local_base_path (str) : all files (with preserved folder structures) are put inside this local path Returns: type: None Raises: None directly Examples: Example usage of the method: >>> hostname = "https://yourhost.de/webdav" >>> username = "Schlawiner23" >>> password = "YOUR_PERSONAL_TOKEN" >>> remote_base_path = "MyProjectFolder" >>> local_base_path = "assets" >>> webdevclient = WebDavClient(args.hostname, args.username, args.password) >>> webdevclient.download_all_files_iterative( >>> args.remote_base_path, args.local_base_path >>> ) """ # Initialize the stack with the root directory stack: list[str] = [remote_base_path] global_remote_base_path: str = remote_base_path while stack: current_remote_path: str = stack.pop() logger.debug("Current remote path is: {}", current_remote_path) # Download files in the current directory self.download_files( global_remote_base_path, current_remote_path, local_base_path, ) # List all folders in the current remote path folders: list[str] = self.list_folders(current_remote_path) if len(folders) == 0: logger.info( "Found no subfolders for current folder: {}", current_remote_path ) # Add each subfolder to the stack for folder in folders: logger.info( "Found subfolder {} for current folder: {}.", folder, current_remote_path, ) relative_folder_path: str = os.path.join(current_remote_path, folder) relative_folder_path = relative_folder_path.replace(os.sep, "/") stack.append(relative_folder_path)