Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
pocketflow>=0.0.1
pyyaml>=6.0
requests>=2.28.0
gitpython>=3.1.0
google-cloud-aiplatform>=1.25.0
google-genai>=1.9.0
107 changes: 100 additions & 7 deletions utils/crawl_github_files.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import requests
import base64
import os
import tempfile
import git
import time
import fnmatch
from typing import Union, Set, List, Dict, Tuple, Any
Expand All @@ -16,18 +18,21 @@ def crawl_github_files(
):
"""
Crawl files from a specific path in a GitHub repository at a specific commit.

Args:
repo_url (str): URL of the GitHub repository with specific path and commit
(e.g., 'https://github.com/microsoft/autogen/tree/e45a15766746d95f8cfaaa705b0371267bec812e/python/packages/autogen-core/src/autogen_core')
token (str, optional): GitHub personal access token. Required for private repositories and recommended for public repos to avoid rate limits.
token (str, optional): **GitHub personal access token.**
- **Required for private repositories.**
- **Recommended for public repos to avoid rate limits.**
- Can be passed explicitly or set via the `GITHUB_TOKEN` environment variable.
max_file_size (int, optional): Maximum file size in bytes to download (default: 1 MB)
use_relative_paths (bool, optional): If True, file paths will be relative to the specified subdirectory
include_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to include (e.g., "*.py", {"*.md", "*.txt"}).
If None, all files are included.
exclude_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to exclude.
If None, no files are excluded.

Returns:
dict: Dictionary with files and statistics
"""
Expand All @@ -36,7 +41,89 @@ def crawl_github_files(
include_patterns = {include_patterns}
if exclude_patterns and isinstance(exclude_patterns, str):
exclude_patterns = {exclude_patterns}


def should_include_file(file_path: str, file_name: str) -> bool:
"""Determine if a file should be included based on patterns"""
# If no include patterns are specified, include all files
if not include_patterns:
include_file = True
else:
# Check if file matches any include pattern
include_file = any(fnmatch.fnmatch(file_name, pattern) for pattern in include_patterns)

# If exclude patterns are specified, check if file should be excluded
if exclude_patterns and include_file:
# Exclude if file matches any exclude pattern
exclude_file = any(fnmatch.fnmatch(file_path, pattern) for pattern in exclude_patterns)
return not exclude_file

return include_file

# Detect SSH URL (git@ or .git suffix)
is_ssh_url = repo_url.startswith("git@") or repo_url.endswith(".git")

if is_ssh_url:
# Clone repo via SSH to temp dir
with tempfile.TemporaryDirectory() as tmpdirname:
print(f"Cloning SSH repo {repo_url} to temp dir {tmpdirname} ...")
try:
repo = git.Repo.clone_from(repo_url, tmpdirname)
except Exception as e:
print(f"Error cloning repo: {e}")
return {"files": {}, "stats": {"error": str(e)}}

# Attempt to checkout specific commit/branch if in URL
# Parse ref and subdir from SSH URL? SSH URLs don't have branch info embedded
# So rely on default branch, or user can checkout manually later
# Optionally, user can pass ref explicitly in future API

# Walk directory
files = {}
skipped_files = []

for root, dirs, filenames in os.walk(tmpdirname):
for filename in filenames:
abs_path = os.path.join(root, filename)
rel_path = os.path.relpath(abs_path, tmpdirname)

# Check file size
try:
file_size = os.path.getsize(abs_path)
except OSError:
continue

if file_size > max_file_size:
skipped_files.append((rel_path, file_size))
print(f"Skipping {rel_path}: size {file_size} exceeds limit {max_file_size}")
continue

# Check include/exclude patterns
if not should_include_file(rel_path, filename):
print(f"Skipping {rel_path}: does not match include/exclude patterns")
continue

# Read content
try:
with open(abs_path, "r", encoding="utf-8") as f:
content = f.read()
files[rel_path] = content
print(f"Added {rel_path} ({file_size} bytes)")
except Exception as e:
print(f"Failed to read {rel_path}: {e}")

return {
"files": files,
"stats": {
"downloaded_count": len(files),
"skipped_count": len(skipped_files),
"skipped_files": skipped_files,
"base_path": None,
"include_patterns": include_patterns,
"exclude_patterns": exclude_patterns,
"source": "ssh_clone"
}
}

# Parse GitHub URL to extract owner, repo, commit/branch, and path
parsed_url = urlparse(repo_url)
path_parts = parsed_url.path.strip('/').split('/')
Expand Down Expand Up @@ -101,9 +188,11 @@ def fetch_contents(path):

if response.status_code == 404:
if not token:
print(f"Error 404: Repository not found or is private. If this is a private repository, you need to provide a token.")
print(f"Error 404: Repository not found or is private.\n"
f"If this is a private repository, please provide a valid GitHub token via the 'token' argument or set the GITHUB_TOKEN environment variable.")
else:
print(f"Error 404: Path '{path}' not found in repository or insufficient permissions.")
print(f"Error 404: Path '{path}' not found in repository or insufficient permissions with the provided token.\n"
f"Please verify the token has access to this repository and the path exists.")
return

if response.status_code != 200:
Expand Down Expand Up @@ -201,8 +290,12 @@ def fetch_contents(path):

# Example usage
if __name__ == "__main__":
# Get token from environment variable (more secure than hardcoding)
# Get token from environment variable (recommended for private repos)
github_token = os.environ.get("GITHUB_TOKEN")
if not github_token:
print("Warning: No GitHub token found in environment variable 'GITHUB_TOKEN'.\n"
"Private repositories will not be accessible without a token.\n"
"To access private repos, set the environment variable or pass the token explicitly.")

repo_url = "https://github.com/pydantic/pydantic/tree/6c38dc93f40a47f4d1350adca9ec0d72502e223f/pydantic"

Expand Down