Improve GitHub web scraping error handling and code organization

2026-01-27 10:19:49 +00:00 · 2024-12-27 16:37:20 +07:00
parent 7fca30a23c
commit 6045068c40
1 changed file with 42 additions and 22 deletions
--- a/Scripts/github.py
+++ b/Scripts/github.py
@@ -12,9 +12,11 @@ class Github:
        
    def get_latest_commit(self, owner, repo, branch="main"):
        url = "https://github.com/{}/{}/commits/{}".format(owner, repo, branch)
-
        response = self.fetcher.fetch_and_parse_content(url)

+        if not response:
+            raise ValueError("Failed to fetch commit information from GitHub.")
+
        for line in response.splitlines():
            if "href=\"" in line and "/commit/" in line and "title=\"" in line:
                sha = line.split("href=\"", 1)[1].split("\"", 1)[0].split("/commit/")[-1]
@@ -34,43 +36,61 @@ class Github:
        url = "https://github.com/{}/{}/releases".format(owner, repo)
        response = self.fetcher.fetch_and_parse_content(url)

-        body = ""
-        tag_name = None
-        assets = []
-        
-        for line in response.splitlines():
-            if "<a" in line and "href=\"" in line and "/releases/tag/" in line and not tag_name:
-                tag_name = line.split("/releases/tag/")[1].split("\"")[0]
-            elif "<div" in line and "body-content" in line:
-                body = response.split(line.split(">", 1)[0], 1)[1].split("</div>", 1)[0][1:]
-                break
+        if not response:
+            raise ValueError("Failed to fetch release information from GitHub.")
+
+        tag_name = self._extract_tag_name(response)
+        body = self._extract_body_content(response)

        release_tag_url = "https://github.com/{}/{}/releases/expanded_assets/{}".format(owner, repo, tag_name)
        response = self.fetcher.fetch_and_parse_content(release_tag_url)

+        if not response:
+            raise ValueError("Failed to fetch expanded assets information from GitHub.")
+
+        assets = self._extract_assets(response)
+
+        return {
+            "body": body,
+            "assets": assets
+        }
+
+    def _extract_tag_name(self, response):
+        for line in response.splitlines():
+            if "<a" in line and "href=\"" in line and "/releases/tag/" in line:
+                return line.split("/releases/tag/")[1].split("\"")[0]
+        return None
+
+    def _extract_body_content(self, response):
+        for line in response.splitlines():
+            if "<div" in line and "body-content" in line:
+                return response.split(line.split(">", 1)[0], 1)[1].split("</div>", 1)[0][1:]
+        return ""
+
+    def _extract_assets(self, response):
+        assets = []
+
        for line in response.splitlines():
            if "<a" in line and "href=\"" in line and "/releases/download" in line:
                download_link = line.split("href=\"", 1)[1].split("\"", 1)[0]

                if "tlwm" in download_link or ("tlwm" not in download_link and "DEBUG" not in download_link.upper()):
                    asset_data = response.split(line)[1].split("</div>", 2)[1]
-
-                    try:
-                        asset_id = "".join(char for char in asset_data.split("datetime=\"")[-1].split("\"")[0][::-1] if char.isdigit())[:9]
-                    except:
-                        asset_id = "".join(random.choices('0123456789', k=9))
-
+                    asset_id = self._generate_asset_id(asset_data)
                    assets.append({
                        "product_name": self.extract_asset_name(download_link.split("/")[-1]), 
                        "id": int(asset_id), 
                        "url": "https://github.com" + download_link
                    })

-        return {
-            "body": body,
-            "assets": assets
-        }
-    
+        return assets
+
+    def _generate_asset_id(self, asset_data):
+        try:
+            return "".join(char for char in asset_data.split("datetime=\"")[-1].split("\"")[0][::-1] if char.isdigit())[:9]
+        except:
+            return "".join(random.choices('0123456789', k=9))
+
    def extract_asset_name(self, file_name):
        end_idx = len(file_name)
        if "-" in file_name: