From f4a4922222b416ac699f4547f94c048b94e5eba0 Mon Sep 17 00:00:00 2001 From: Rainxch Zed Date: Mon, 29 Dec 2025 11:21:29 +0500 Subject: [PATCH 1/2] refactor(data): Improve caching and trending repo fetching This commit refactors the data layer for fetching and handling cached trending repositories, improving both the client-side parsing and the backend script that generates the data. ### Key Changes: **Data Fetching & Parsing:** - Introduced `CachedGithubRepoSummary` and `CachedGithubOwner` data classes to precisely match the structure of the pre-cached JSON files. This prevents parsing errors if the full `GithubRepoSummary` model contains fields not present in the cached data. - Added a `toGithubRepoSummary()` extension function to map the cached data model to the domain model used in the app. - Enhanced logging in `CachedTrendingDataSource` with more detailed messages for success, failures (404), timeouts, and serialization errors to improve debugging. - Removed the `ContentNegotiation` plugin from the dedicated `HttpClient` in `CachedTrendingDataSource` to handle JSON parsing manually, providing better error handling. **Backend Script (`fetch_trending.py`):** - Implemented a more robust, multi-attempt search strategy to find a sufficient number of relevant repositories. - The script now progressively broadens its search criteria (widening the date range, lowering the star requirement, and eventually dropping topic filters) across multiple attempts if not enough results are found initially. - The desired number of repositories to fetch per platform has been increased from 30 to 80 to provide a richer dataset. - The logic now tracks repositories that have already been checked (in a `seen` set) to avoid redundant API calls. - The final list of repositories is sorted by star count before being saved. 
**CI/CD (`fetch-trending-repos.yml`):** - The cron schedule for the trending repositories job has been changed from every 6 hours to every 12 hours to reduce build frequency. - The Git commit-and-push logic is simplified to use `git commit || echo "No changes to commit"` to gracefully handle cases where no data has changed, removing the need for a separate check step. - The job commits first and only then runs `git pull origin main --rebase` before pushing: a rebase requires a clean working tree, so pulling ahead of the commit would abort the step whenever the fetch script had modified tracked files in `cached-data/`. --- .github/workflows/fetch-trending-repos.yml | 22 +-- .../data_source/CachedTrendingDataSource.kt | 122 +++++++++--- .../data/repository/HomeRepositoryImpl.kt | 5 +- scripts/fetch_trending.py | 179 ++++++++++-------- scripts/requirements.txt | 2 +- 5 files changed, 203 insertions(+), 127 deletions(-) diff --git a/.github/workflows/fetch-trending-repos.yml b/.github/workflows/fetch-trending-repos.yml index 6d6f8e237..31bb8ad31 100644 --- a/.github/workflows/fetch-trending-repos.yml +++ b/.github/workflows/fetch-trending-repos.yml @@ -2,8 +2,8 @@ name: Fetch Trending Repositories on: schedule: - # Run every 6 hours - - cron: '0 */6 * * *' + # Run every 12 hours + - cron: '0 */12 * * *' workflow_dispatch: # Allow manual triggering jobs: @@ -32,21 +32,11 @@ jobs: run: | python scripts/fetch_trending.py - - name: Check for changes - id: git-check - run: | - git diff --exit-code cached-data/ || echo "changed=true" >> $GITHUB_OUTPUT - - - name: Commit and push if changed - if: steps.git-check.outputs.changed == 'true' + - name: Commit and push changes run: | git config --local user.email "github-actions[bot]@users.noreply.github.com" git config --local user.name "github-actions[bot]" git add cached-data/ - git commit -m "Update trending repositories - $(date -u +'%Y-%m-%d %H:%M:%S UTC')" - git push - - - name: No changes detected - if: steps.git-check.outputs.changed != 'true' - run: | - echo "No changes in trending repositories" \ No newline at end of file + git commit -m "Update trending repositories - $(date -u +'%Y-%m-%d %H:%M:%S UTC')" || echo "No changes to commit" + git pull origin main --rebase +
git push \ No newline at end of file diff --git a/composeApp/src/commonMain/kotlin/zed/rainxch/githubstore/feature/home/data/data_source/CachedTrendingDataSource.kt b/composeApp/src/commonMain/kotlin/zed/rainxch/githubstore/feature/home/data/data_source/CachedTrendingDataSource.kt index 76d081039..2ff688df6 100644 --- a/composeApp/src/commonMain/kotlin/zed/rainxch/githubstore/feature/home/data/data_source/CachedTrendingDataSource.kt +++ b/composeApp/src/commonMain/kotlin/zed/rainxch/githubstore/feature/home/data/data_source/CachedTrendingDataSource.kt @@ -1,30 +1,24 @@ package zed.rainxch.githubstore.feature.home.data.data_source -/** - * Data source for fetching pre-cached trending repositories from GitHub - */ import co.touchlab.kermit.Logger -import io.ktor.client.* -import io.ktor.client.call.* -import io.ktor.client.plugins.* -import io.ktor.client.plugins.contentnegotiation.* -import io.ktor.client.request.* -import io.ktor.client.statement.* -import io.ktor.http.* -import io.ktor.serialization.kotlinx.json.* +import io.ktor.client.HttpClient +import io.ktor.client.plugins.HttpRequestRetry +import io.ktor.client.plugins.HttpRequestTimeoutException +import io.ktor.client.plugins.HttpTimeout +import io.ktor.client.request.get +import io.ktor.client.statement.HttpResponse +import io.ktor.client.statement.bodyAsText +import io.ktor.http.isSuccess import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.withContext import kotlinx.serialization.Serializable +import kotlinx.serialization.SerializationException import kotlinx.serialization.json.Json import zed.rainxch.githubstore.core.domain.Platform import zed.rainxch.githubstore.core.domain.model.GithubRepoSummary +import zed.rainxch.githubstore.core.domain.model.GithubUser import zed.rainxch.githubstore.core.domain.model.PlatformType -/** - * Data source for fetching pre-cached trending repositories from GitHub - * Uses a dedicated HTTP client (not the GitHub API client) since this fetches - * static JSON 
files from raw.githubusercontent.com (no auth or rate limits needed) - */ class CachedTrendingDataSource( private val platform: Platform ) { @@ -34,10 +28,6 @@ class CachedTrendingDataSource( } private val httpClient = HttpClient { - install(ContentNegotiation) { - json(json) - } - install(HttpTimeout) { requestTimeoutMillis = 15_000 connectTimeoutMillis = 10_000 @@ -71,33 +61,47 @@ class CachedTrendingDataSource( val url = "$baseUrl/$platformName.json" - Logger.d { "Fetching cached trending repos from: $url" } + Logger.d { "🔍 Fetching cached trending repos from: $url" } val response: HttpResponse = httpClient.get(url) + Logger.d { "📥 Response status: ${response.status.value} ${response.status.description}" } + when { response.status.isSuccess() -> { - val cachedData = response.body() + val responseText = response.bodyAsText() + Logger.d { "📄 Response body length: ${responseText.length} characters" } + + val cachedData = json.decodeFromString(responseText) Logger.d { "✓ Successfully loaded ${cachedData.repositories.size} cached repos" } - Logger.d { "Last updated: ${cachedData.lastUpdated}" } + Logger.d { "✓ Last updated: ${cachedData.lastUpdated}" } cachedData } response.status.value == 404 -> { - Logger.w { "Cached data not found (404) - may not be generated yet" } + Logger.w { "⚠️ Cached data not found (404) - may not be generated yet" } + Logger.w { "⚠️ URL attempted: $url" } null } else -> { - Logger.e { "Failed to fetch cached repos: HTTP ${response.status.value}" } + val errorBody = response.bodyAsText() + Logger.e { "❌ Failed to fetch cached repos: HTTP ${response.status.value}" } + Logger.e { "❌ Response body: ${errorBody.take(500)}" } null } } } catch (e: HttpRequestTimeoutException) { - Logger.e { "Timeout fetching cached trending repos" } + Logger.e { "⏱️ Timeout fetching cached trending repos: ${e.message}" } + e.printStackTrace() + null + } catch (e: SerializationException) { + Logger.e { "🔧 JSON parsing error: ${e.message}" } + e.printStackTrace() 
null } catch (e: Exception) { - Logger.e { "Error fetching cached trending repos: ${e.message}" } + Logger.e { "💥 Error fetching cached trending repos: ${e.message}" } + Logger.e { "💥 Exception type: ${e::class.simpleName}" } e.printStackTrace() null } @@ -112,10 +116,70 @@ class CachedTrendingDataSource( } } +/** + * Cached repository data for a specific platform + */ @Serializable data class CachedRepoResponse( val platform: String, val lastUpdated: String, val totalCount: Int, - val repositories: List -) \ No newline at end of file + val repositories: List +) + +/** + * Simplified repo summary for cached data + * Only includes the fields present in the cached JSON files + */ +@Serializable +data class CachedGithubRepoSummary( + val id: Long, + val name: String, + val fullName: String, + val owner: CachedGithubOwner, + val description: String?, + val defaultBranch: String, + val htmlUrl: String, + val stargazersCount: Int, + val forksCount: Int, + val language: String?, + val topics: List?, + val releasesUrl: String, + val updatedAt: String +) + +/** + * Simplified owner data for cached repos + * Only includes login and avatarUrl (not id and htmlUrl) + */ +@Serializable +data class CachedGithubOwner( + val login: String, + val avatarUrl: String +) + +/** + * Extension to convert cached summary to full GithubRepoSummary + */ +fun CachedGithubRepoSummary.toGithubRepoSummary(): GithubRepoSummary { + return GithubRepoSummary( + id = id, + name = name, + fullName = fullName, + owner = GithubUser( + id = 0, + login = owner.login, + avatarUrl = owner.avatarUrl, + htmlUrl = "https://github.com/${owner.login}" + ), + description = description, + defaultBranch = defaultBranch, + htmlUrl = htmlUrl, + stargazersCount = stargazersCount, + forksCount = forksCount, + language = language, + topics = topics, + releasesUrl = releasesUrl, + updatedAt = updatedAt + ) +} \ No newline at end of file diff --git 
a/composeApp/src/commonMain/kotlin/zed/rainxch/githubstore/feature/home/data/repository/HomeRepositoryImpl.kt b/composeApp/src/commonMain/kotlin/zed/rainxch/githubstore/feature/home/data/repository/HomeRepositoryImpl.kt index 6fb239096..7b45a9e2d 100644 --- a/composeApp/src/commonMain/kotlin/zed/rainxch/githubstore/feature/home/data/repository/HomeRepositoryImpl.kt +++ b/composeApp/src/commonMain/kotlin/zed/rainxch/githubstore/feature/home/data/repository/HomeRepositoryImpl.kt @@ -32,6 +32,7 @@ import zed.rainxch.githubstore.core.data.model.GithubRepoSearchResponse import zed.rainxch.githubstore.core.domain.Platform import zed.rainxch.githubstore.core.domain.model.PlatformType import zed.rainxch.githubstore.feature.home.data.data_source.CachedTrendingDataSource +import zed.rainxch.githubstore.feature.home.data.data_source.toGithubRepoSummary import zed.rainxch.githubstore.feature.home.domain.repository.HomeRepository import zed.rainxch.githubstore.feature.home.domain.model.PaginatedRepos import zed.rainxch.githubstore.network.RateLimitException @@ -57,9 +58,11 @@ class HomeRepositoryImpl( if (cachedData != null && cachedData.repositories.isNotEmpty()) { Logger.d { "Using cached data: ${cachedData.repositories.size} repos" } + val repos = cachedData.repositories.map { it.toGithubRepoSummary() } + emit( PaginatedRepos( - repos = cachedData.repositories, + repos = repos, hasMore = false, nextPageIndex = 2 ) diff --git a/scripts/fetch_trending.py b/scripts/fetch_trending.py index 7c38f6472..9a3706917 100644 --- a/scripts/fetch_trending.py +++ b/scripts/fetch_trending.py @@ -199,115 +199,134 @@ def build_query(base_query: str, topics: List[str]) -> str: if len(topics) == 1: topic_query = f"topic:{topics[0]}" else: - # GitHub search supports OR with spaces between topics topic_parts = [f"topic:{topic}" for topic in topics] topic_query = " OR ".join(topic_parts) topic_query = f"({topic_query})" return f"{base_query} {topic_query}" -def fetch_trending_repos(platform: str, 
desired_count: int = 30) -> List[Dict]: +def fetch_trending_repos(platform: str, desired_count: int = 80) -> List[Dict]: """Fetch trending repositories for a specific platform""" print(f"\n{'='*60}") print(f"Fetching trending repos for {platform.upper()}") print(f"{'='*60}") - # Calculate date 7 days ago - seven_days_ago = (datetime.utcnow() - timedelta(days=7)).strftime('%Y-%m-%d') - + url = 'https://api.github.com/search/repositories' topics = PLATFORMS[platform]['topics'] - base_query = f'stars:>500 archived:false pushed:>={seven_days_ago}' - query = build_query(base_query, topics) - print(f"Query: {query}") + results: List[Dict] = [] + seen: set = set() + attempt = 0 + max_attempts = 4 + min_count = 10 # Ensure at least this many if possible - results = [] - page = 1 - max_pages = 5 + while len(results) < desired_count and attempt < max_attempts: + attempt += 1 + days = 7 * (2 ** (attempt - 1)) # 7, 14, 28, 56 + stars_min = max(500 // (2 ** (attempt - 1)), 50) # 500, 250, 125, 62 -> min 50 + current_topics = topics if attempt < 3 else [] # Drop topics on later attempts to broaden search - while len(results) < desired_count and page <= max_pages: - print(f"\nFetching API page {page}...") + past_date = (datetime.utcnow() - timedelta(days=days)).strftime('%Y-%m-%d') + base_query = f'stars:>{stars_min} archived:false pushed:>={past_date}' + query = build_query(base_query, current_topics) - url = 'https://api.github.com/search/repositories' - params = { - 'q': query, - 'sort': 'stars', - 'order': 'desc', - 'per_page': 100, - 'page': page - } + print(f"Attempt {attempt}: days={days}, stars>{stars_min}, topics={current_topics or 'none'}") + print(f"Query: {query}") - response, error = make_request_with_retry(url, params=params, timeout=30) + page = 1 + max_pages = 10 # Increased to allow more candidates - if response is None: - print(f"Failed to fetch page {page}: {error}") - break + while len(results) < desired_count and page <= max_pages: + print(f"\nFetching API 
page {page}...") - try: - data = response.json() - items = data.get('items', []) + params = { + 'q': query, + 'sort': 'stars', + 'order': 'desc', + 'per_page': 100, + 'page': page + } - print(f"Got {len(items)} repositories from API") + response, error = make_request_with_retry(url, params=params, timeout=30) - if not items: + if response is None: + print(f"Failed to fetch page {page}: {error}") break - # Score and filter candidates - candidates = [] - for repo in items: - score = calculate_platform_score(repo, platform) - if score > 0: - candidates.append((repo, score)) - - # Sort by score and take top 50 - candidates.sort(key=lambda x: x[1], reverse=True) - candidates = [repo for repo, _ in candidates[:50]] + try: + data = response.json() + items = data.get('items', []) - print(f"Checking {len(candidates)} candidates for installers...") + print(f"Got {len(items)} repositories from API") - # Check each candidate for installers - for repo in candidates: - if len(results) >= desired_count: + if not items: break - owner = repo['owner']['login'] - name = repo['name'] - - print(f"Checking {owner}/{name}...", end=' ') - - if check_repo_has_installers(owner, name, platform): - # Transform to summary format - summary = { - 'id': repo['id'], - 'name': repo['name'], - 'fullName': repo['full_name'], - 'owner': { - 'login': repo['owner']['login'], - 'avatarUrl': repo['owner']['avatar_url'] - }, - 'description': repo.get('description'), - 'defaultBranch': repo.get('default_branch', 'main'), - 'htmlUrl': repo['html_url'], - 'stargazersCount': repo['stargazers_count'], - 'forksCount': repo['forks_count'], - 'language': repo.get('language'), - 'topics': repo.get('topics', []), - 'releasesUrl': repo['releases_url'], - 'updatedAt': repo['updated_at'] - } - results.append(summary) - print(f"✓ Found ({len(results)}/{desired_count})") - else: - print("✗ No installers") + # Score and filter candidates + candidates = [] + for repo in items: + score = calculate_platform_score(repo, 
platform) + if score >= 5: + candidates.append((repo, score)) - # Small delay to avoid rate limiting - time.sleep(0.5) + # Sort by score and take top 50 + candidates.sort(key=lambda x: x[1], reverse=True) + candidates = [repo for repo, _ in candidates[:50]] - page += 1 + print(f"Checking {len(candidates)} candidates for installers...") - except Exception as e: - print(f"Error processing page {page}: {e}", file=sys.stderr) - break + # Check each candidate for installers + for repo in candidates: + if len(results) >= desired_count: + break + + full_name = repo['full_name'] + if full_name in seen: + continue + + owner = repo['owner']['login'] + name = repo['name'] + + print(f"Checking {owner}/{name}...", end=' ') + + if check_repo_has_installers(owner, name, platform): + # Transform to summary format + summary = { + 'id': repo['id'], + 'name': repo['name'], + 'fullName': full_name, + 'owner': { + 'login': owner, + 'avatarUrl': repo['owner']['avatar_url'] + }, + 'description': repo.get('description'), + 'defaultBranch': repo.get('default_branch', 'main'), + 'htmlUrl': repo['html_url'], + 'stargazersCount': repo['stargazers_count'], + 'forksCount': repo['forks_count'], + 'language': repo.get('language'), + 'topics': repo.get('topics', []), + 'releasesUrl': repo['releases_url'], + 'updatedAt': repo['updated_at'] + } + results.append(summary) + seen.add(full_name) + print(f"✓ Found ({len(results)}/{desired_count}) {full_name}") + else: + print(f"✗ No installers {full_name}") + + seen.add(full_name) # Add to seen even if no installers to avoid rechecking + time.sleep(0.5) + + page += 1 + + except Exception as e: + print(f"Error processing page {page}: {e}", file=sys.stderr) + break + + # Sort final results by stargazers count descending and truncate to desired count + results.sort(key=lambda x: x['stargazersCount'], reverse=True) + results = results[:desired_count] print(f"\n{'='*60}") print(f"Total found: {len(results)} repositories for {platform}") @@ -322,7 +341,7 @@ 
def main(): for platform in PLATFORMS.keys(): print(f"\nProcessing {platform}...") - repos = fetch_trending_repos(platform, desired_count=30) + repos = fetch_trending_repos(platform, desired_count=80) output = { 'platform': platform, diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 05a991d5d..23abd5291 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1 +1 @@ -requests>=2.32.4 \ No newline at end of file +requests>=2.32.3 \ No newline at end of file From 81399e430749ae20fa77d36ba31309f00e661210 Mon Sep 17 00:00:00 2001 From: Rainxch Zed Date: Mon, 29 Dec 2025 11:33:37 +0500 Subject: [PATCH 2/2] chore: Bump requests from 2.32.3 to 2.32.4 This commit updates the version of the `requests` library in the project's requirements. --- scripts/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 23abd5291..05a991d5d 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1 +1 @@ -requests>=2.32.3 \ No newline at end of file +requests>=2.32.4 \ No newline at end of file