Viewing file: download_pypi_packages.py (2.56 KB) -rwxr-xr-x Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
#!/usr/bin/env python3.8
import argparse import os import json
from typing import Dict, Any from urllib.request import urlretrieve
# Command-line interface for the downloader. The parser is built at import
# time so that main() only has to call argparser.parse_args().
argparser = argparse.ArgumentParser(
    description="Helper program to download PyPI packages",
    prog="download_pypi_packages",
)
# -n/--number: how many entries to take from the start of the package list.
argparser.add_argument(
    "-n",
    "--number",
    default=100,
    type=int,
    help="Number of packages to download",
)
# -a/--all: take every listed package; main() checks this flag before --number.
argparser.add_argument(
    "-a",
    "--all",
    help="Download all packages listed in the json file",
    action="store_true",
)
def load_json(filename: str) -> Dict[Any, Any]:
    """Load and parse ``data/<filename>.json``.

    Args:
        filename: Base name of the file, without the ``.json`` extension.
            The path is resolved against the ``data`` directory relative to
            the current working directory.

    Returns:
        The decoded JSON document.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    path = os.path.join("data", f"{filename}.json")
    # The encoding is pinned to UTF-8 rather than left to the locale default:
    # package metadata can contain non-ASCII text, which would otherwise fail
    # to decode (or decode wrongly) on platforms whose default is not UTF-8.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
def remove_json(filename: str) -> None:
    """Delete ``data/<filename>.json``.

    Args:
        filename: Base name of the file, without the ``.json`` extension.

    Raises:
        FileNotFoundError: If the file does not exist (propagated from
            ``os.remove``).
    """
    os.remove(os.path.join("data", f"{filename}.json"))
def download_package_json(package_name: str) -> None:
    """Download a package's PyPI JSON metadata to ``data/<package_name>.json``.

    Args:
        package_name: Project name, interpolated into the
            ``https://pypi.org/pypi/<package_name>/json`` URL and used as the
            base name of the saved file.
    """
    destination = os.path.join("data", f"{package_name}.json")
    urlretrieve(f"https://pypi.org/pypi/{package_name}/json", destination)
def download_package_code(name: str, package_json: Dict[Any, Any]) -> None:
    """Download the source distribution of a package into ``data/pypi``.

    The first entry of ``package_json["urls"]`` whose ``"python_version"`` is
    ``"source"`` is selected, and its ``"url"`` is saved under
    ``data/pypi/<filename>``.

    Args:
        name: Project name; used only in the error message.
        package_json: Decoded package metadata. Must contain a ``"urls"`` list
            of dicts with ``"python_version"``, ``"filename"`` and ``"url"``.

    Raises:
        IndexError: If no entry in ``"urls"`` is a source distribution
            (including when the list is empty).
        KeyError: If an expected key is missing from the metadata.
    """
    source_info = next(
        (info for info in package_json["urls"] if info["python_version"] == "source"),
        None,
    )
    if source_info is None:
        # Previously a -1 sentinel was used directly as a list index, so a
        # release that only ships wheels silently downloaded the *last* wheel.
        # Raise the error the caller already handles instead.
        raise IndexError(f"no source distribution listed for {name}")

    destination = os.path.join("data", "pypi", source_info["filename"])
    urlretrieve(source_info["url"], destination)
def main() -> None:
    """Download the source archives of the packages listed in the data file.

    Reads ``data/top-pypi-packages-365-days.json``, selects either every row
    (``--all``) or the first ``--number`` rows, and for each selected project
    downloads its JSON metadata, then its source archive into ``data/pypi``.
    The per-package metadata file is deleted afterwards.

    Raises:
        AssertionError: If ``--all`` is not given and ``--number`` is outside
            the range 0..4000.
    """
    options = argparser.parse_args()

    catalog = load_json("top-pypi-packages-365-days")
    # --all takes precedence; otherwise --number must be within 0..4000.
    if not options.all and not (0 <= options.number <= 4000):
        raise AssertionError("Unknown value for NUMBER_OF_PACKAGES")
    rows = catalog["rows"]
    selected = rows if options.all else rows[: options.number]

    target_dir = os.path.join("data", "pypi")
    try:
        os.mkdir(target_dir)
    except FileExistsError:
        # The directory is left over from an earlier run; reuse it.
        pass

    for entry in selected:
        project = entry["project"]

        print(f"Downloading JSON Data for {project}... ", end="")
        download_package_json(project)
        print("Done")

        metadata = load_json(project)
        try:
            print(f"Downloading and compressing package {project} ... ", end="")
            download_package_code(project, metadata)
            print("Done")
        except (IndexError, KeyError):
            # Report the problem and move on to the next package.
            print(f"Could not locate source for {project}")
        finally:
            # The metadata file is only a temporary artifact; always drop it.
            remove_json(project)
# Run the downloader only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__": main()
|