2121import logging
2222import os
2323import shutil
24+ import subprocess
25+ import urllib .parse
2426from typing import Optional , Union
2527
2628import click
@@ -213,7 +215,7 @@ def download_station(
213215 check_exists = True ,
214216 )
215217 # Download data
216- _download_station_data (metadata_filepath , data_archive_dir = data_archive_dir , force = force )
218+ download_station_data (metadata_filepath , data_archive_dir = data_archive_dir , force = force )
217219
218220
219221def _is_valid_disdrodb_data_url (disdrodb_data_url ):
@@ -228,13 +230,25 @@ def _extract_station_files(zip_filepath, station_dir):
228230 os .remove (zip_filepath )
229231
230232
231- def _download_station_data (metadata_filepath : str , data_archive_dir : str , force : bool = False ) -> None :
def check_consistent_station_name(metadata_filepath, station_name):
    """Check that the metadata ``station_name`` matches the YAML file name.

    Parameters
    ----------
    metadata_filepath : str
        Path of the station metadata YAML file. The expected station name is
        the file base name without its trailing ``.yml`` extension.
    station_name : str
        Value of the ``station_name`` metadata key. Falsy values
        (e.g. ``None`` or ``""``) skip the check.

    Returns
    -------
    str
        The ``station_name`` argument, unmodified.

    Raises
    ------
    ValueError
        If ``station_name`` is set and differs from the name derived from
        the metadata file name.
    """
    expected_station_name = os.path.basename(metadata_filepath)
    # Strip only the trailing extension: str.replace would also remove any
    # ".yml" occurring inside the station name itself.
    if expected_station_name.endswith(".yml"):
        expected_station_name = expected_station_name[: -len(".yml")]
    if station_name and str(station_name) != expected_station_name:
        raise ValueError(f"Inconsistent station_name values in the {metadata_filepath} file. Download aborted.")
    return station_name
240+
241+
242+ def download_station_data (metadata_filepath : str , data_archive_dir : str , force : bool = False ) -> None :
232243 """Download and unzip the station data .
233244
234245 Parameters
235246 ----------
236247 metadata_filepaths : str
237248 Metadata file path.
249+ data_archive_dir : str (optional)
250+ DISDRODB Data Archive directory. Format: ``<...>/DISDRODB``.
251+ If ``None`` (the default), the disdrodb config variable ``data_archive_dir`` is used.
238252 force : bool, optional
239253 If ``True``, delete existing files and redownload it. The default value is ``False``.
240254
@@ -247,7 +261,7 @@ def _download_station_data(metadata_filepath: str, data_archive_dir: str, force:
247261 campaign_name = metadata_dict ["campaign_name" ]
248262 station_name = metadata_dict ["station_name" ]
249263 station_name = check_consistent_station_name (metadata_filepath , station_name )
250- # Define the destination local filepath path
264+ # Define the path to the station RAW data directory
251265 station_dir = define_station_dir (
252266 data_archive_dir = data_archive_dir ,
253267 data_source = data_source ,
@@ -259,19 +273,136 @@ def _download_station_data(metadata_filepath: str, data_archive_dir: str, force:
259273 disdrodb_data_url = metadata_dict .get ("disdrodb_data_url" , None )
260274 if not _is_valid_disdrodb_data_url (disdrodb_data_url ):
261275 raise ValueError (f"Invalid disdrodb_data_url '{ disdrodb_data_url } ' for station { station_name } " )
262- # Download file
263- zip_filepath = _download_file_from_url (disdrodb_data_url , dst_dir = station_dir , force = force )
264- # Extract the stations files from the downloaded station.zip file
265- _extract_station_files (zip_filepath , station_dir = station_dir )
266276
277+ # Download files
278+ # - Option 1: Zip file from Zenodo containing all station raw data
279+ if disdrodb_data_url .startswith ("https://zenodo.org/" ):
280+ download_zenodo_zip_file (url = disdrodb_data_url , dst_dir = station_dir , force = force )
281+ # - Option 2: Recursive download from a web server via HTTP or HTTPS.
282+ elif disdrodb_data_url .startswith ("http" ):
283+ download_web_server_data (url = disdrodb_data_url , dst_dir = station_dir , force = force , verbose = True )
284+ else :
285+ raise NotImplementedError (f"Open a GitHub Issue to enable the download of data from { disdrodb_data_url } ." )
267286
268- def check_consistent_station_name (metadata_filepath , station_name ):
269- """Check consistent station_name between YAML file name and metadata key."""
270- # Check consistent station name
271- expected_station_name = os .path .basename (metadata_filepath ).replace (".yml" , "" )
272- if station_name and str (station_name ) != str (expected_station_name ):
273- raise ValueError (f"Inconsistent station_name values in the { metadata_filepath } file. Download aborted." )
274- return station_name
287+
288+ ####-----------------------------------------------------------------------------------------.
289+ #### Download from Web Server via HTTP or HTTPS
290+
291+
def download_web_server_data(url: str, dst_dir: str, force=True, verbose=True) -> None:
    """Recursively download data from a web server via HTTP or HTTPS.

    The system ``wget`` executable is used to fetch all files and
    subdirectories found under the given "directory" URL, so ``wget`` must be
    installed and available on the PATH.

    Steps:

    1. Ensure ``wget`` is available.
    2. Normalize the URL so that it ends with ``/``.
    3. Compute the ``--cut-dirs`` value with ``compute_cut_dirs``.
    4. Create ``dst_dir`` if it does not exist.
    5. Build and run the ``wget`` command.

    Parameters
    ----------
    url : str
        URL of the remote directory to download.
    dst_dir : str
        Local directory passed to ``wget`` as download prefix (``-P``).
        It is created if missing.
    force : bool, optional
        Forwarded to ``build_webserver_wget_command``.
    verbose : bool, optional
        Forwarded to ``build_webserver_wget_command``.

    Raises
    ------
    FileNotFoundError
        If ``wget`` is not found on the PATH.
    subprocess.CalledProcessError
        If ``wget`` exits with a non-zero return code.

    Example
    -------
    download_web_server_data(
        "https://ruisdael.citg.tudelft.nl/parsivel/PAR001_Cabauw/2021/202101/",
        dst_dir="/tmp/station_data",
    )
    """
    # 1. Ensure wget exists
    ensure_wget_available()

    # 2. Normalize URL
    url = ensure_trailing_slash(url)

    # 3. Compute the number of remote path segments wget must strip
    cut_dirs = compute_cut_dirs(url)

    # 4. Create destination directory if needed
    os.makedirs(dst_dir, exist_ok=True)

    # 5. Build wget command
    cmd = build_webserver_wget_command(url, cut_dirs=cut_dirs, dst_dir=dst_dir, force=force, verbose=verbose)

    # 6. Run wget command.
    # check=True already raises subprocess.CalledProcessError on failure,
    # so the exception is left to propagate instead of being rebuilt.
    subprocess.run(cmd, check=True)
333+
334+
def ensure_wget_available() -> None:
    """Check that the ``wget`` executable can be located on the system PATH.

    Raises
    ------
    FileNotFoundError
        If ``shutil.which`` cannot find ``wget``.
    """
    wget_path = shutil.which("wget")
    if wget_path is not None:
        return
    raise FileNotFoundError("The WGET software was not found. Please install WGET or add it to PATH.")
339+
340+
def ensure_trailing_slash(url: str) -> str:
    """Return ``url`` with a ``/`` appended unless it already ends with one."""
    if url.endswith("/"):
        return url
    # No trailing slash is present on this path, so simply append one.
    return f"{url}/"
344+
345+
def compute_cut_dirs(url: str) -> int:
    """Return the wget ``--cut-dirs`` value that downloads directly in ``dst_dir``.

    The value is the number of ``/``-separated segments in the URL path,
    ignoring leading and trailing slashes. Stripping that many directory
    components makes the files of the final remote directory land directly
    in the destination directory, without an extra subfolder.

    Parameters
    ----------
    url : str
        URL of the remote directory.

    Returns
    -------
    int
        Number of path segments; ``0`` when the URL has no path.
    """
    trimmed_path = urllib.parse.urlparse(url).path.strip("/")
    if not trimmed_path:
        return 0
    # N separators delimit N + 1 segments.
    return trimmed_path.count("/") + 1
358+
359+
def build_webserver_wget_command(url: str, cut_dirs: int, dst_dir: str, force: bool, verbose: bool) -> list[str]:
    """Construct the wget command list for subprocess.run.

    Parameters
    ----------
    url : str
        URL of the remote directory to download recursively.
    cut_dirs : int
        Number of leading remote directory components that wget must drop
        when creating local paths (``--cut-dirs``).
    dst_dir : str
        Local directory prefix where the files are saved (``-P``).
    force : bool
        If ``True``, ``--timestamping`` is omitted so that every remote file
        is retrieved again. If ``False``, ``--timestamping`` is passed so that
        only missing files, or files with a newer remote version, are downloaded.
    verbose : bool
        If ``False``, wget is run in quiet mode (``-q``).

    Returns
    -------
    list[str]
        The command as an argument list, suitable for ``subprocess.run``.

    Notes
    -----
    The following wget arguments are used
    - -q : quiet mode (no detailed progress), only when ``verbose`` is ``False``
    - -r : recursive
    - -np : no parent
    - -nH : no host directories
    - --cut-dirs : number of remote directory components to strip
    - --timestamping: download missing files or when remote version is newer,
      only when ``force`` is ``False``
    - -P dst_dir : download into `dst_dir`
    - url
    """
    cmd = ["wget"]
    # Quiet mode is the opposite of verbose output.
    if not verbose:
        cmd.append("-q")
    cmd += [
        "-r",
        "-np",
        "-nH",
        f"--cut-dirs={cut_dirs}",
    ]
    # Timestamping skips files that are already up to date, so it must be
    # disabled when a full re-download is forced.
    if not force:
        cmd.append("--timestamping")  # -N

    # Define source and destination directory
    cmd += [
        "-P",
        dst_dir,
        url,
    ]
    return cmd
394+
395+
396+ ####--------------------------------------------------------------------.
397+ #### Download from Zenodo
398+
399+
def download_zenodo_zip_file(url, dst_dir, force):
    """Download a station zip file from Zenodo and extract it into ``dst_dir``.

    Parameters
    ----------
    url : str
        URL of the zip file to download.
    dst_dir : str
        Directory where the zip file is downloaded and its content extracted.
    force : bool
        Forwarded to ``_download_file_from_url``.
    """
    # The helper returns the local path of the downloaded archive, which is
    # handed straight to the extraction step.
    _extract_station_files(
        _download_file_from_url(url, dst_dir=dst_dir, force=force),
        station_dir=dst_dir,
    )
275406
276407
277408def _download_file_from_url (url : str , dst_dir : str , force : bool = False ) -> str :
0 commit comments