subtitle update
BennyThink committed Dec 5, 2024
1 parent 494a6b9 commit 4897e7f
Showing 13 changed files with 182 additions and 179 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -146,3 +146,4 @@ logs/*
/yyetsweb/templates/dump/yyets_mongo.gz
/yyetsweb/templates/dump/yyets_mysql.zip
/yyetsweb/templates/dump/yyets_sqlite.zip
/yyetsweb/subtitle_data/attachment/201001/17/758231_1263706947i2nW.rar
7 changes: 0 additions & 7 deletions docker-compose.replica.yml

This file was deleted.

20 changes: 4 additions & 16 deletions docker-compose.yml
@@ -25,6 +25,8 @@ services:
- MEILI_HTTP_PAYLOAD_SIZE_LIMIT=1073741824 #1GiB
volumes:
- meilisearch_data:/meili_data
ports:
- "127.0.0.1:7700:7700"

mysql:
image: ubuntu/mysql:8.0-22.04_beta
@@ -35,27 +37,11 @@
driver: none
command: "--skip-log-bin --default-authentication-plugin=mysql_native_password"

socat:
image: bennythink/socat
restart: unless-stopped
volumes:
- /var/run/docker.sock:/var/run/docker.sock
entrypoint: [ "socat", "tcp-listen:2375,fork,reuseaddr","unix-connect:/var/run/docker.sock" ]
logging:
driver: none

mailhog:
image: cd2team/mailhog
restart: unless-stopped
logging:
driver: none

bot:
image: bennythink/yyetsbot
depends_on:
- redis
- mongo
- socat
restart: always
env_file:
- env/yyets.env
@@ -70,6 +56,8 @@
- redis
- mysql
working_dir: /YYeTsBot/yyetsweb/
volumes:
- ./subtitle_data:/YYeTsBot/yyetsweb/subtitle_data
command: [ "python3","server.py","-h=0.0.0.0" ]
ports:
- "127.0.0.1:8888:8888"
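The new "127.0.0.1:7700" binding publishes Meilisearch on the host's loopback interface only. A quick reachability check from the host (a sketch, assuming no master key is configured) could look like:

import requests

# Meilisearch exposes a /health endpoint; "available" means the instance is up.
print(requests.get("http://127.0.0.1:7700/health").json())  # {"status": "available"}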
19 changes: 19 additions & 0 deletions scripts/migrate_sub.py
@@ -0,0 +1,19 @@
#!/usr/bin/env python3
# coding: utf-8

# YYeTsBot - migrate_sub.py

import pymongo
import pymysql
from pymysql.cursors import DictCursor

con = pymysql.connect(host="mysql", user="root", password="root", database="yyets", charset="utf8")
cur = con.cursor(cursor=DictCursor)
mongo_client = pymongo.MongoClient(host="mongo")
col = mongo_client["zimuzu"]["subtitle"]

cur.execute("select * from subtitle")

# 56134 rows
for sub in cur.fetchall():
col.insert_one(sub)
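
The script above inserts the ~56,000 subtitle rows one document at a time. A batched variant along these lines (a sketch, not part of the commit, assuming the same hostnames and table layout) would cut the round trips to MongoDB:

#!/usr/bin/env python3
# coding: utf-8
# Hypothetical batched take on migrate_sub.py

import pymongo
import pymysql
from pymysql.cursors import DictCursor

BATCH = 1000  # assumed batch size

con = pymysql.connect(host="mysql", user="root", password="root", database="yyets", charset="utf8")
cur = con.cursor(cursor=DictCursor)
col = pymongo.MongoClient(host="mongo")["zimuzu"]["subtitle"]

cur.execute("select * from subtitle")
while True:
    rows = cur.fetchmany(BATCH)
    if not rows:
        break
    col.insert_many(rows)  # one round trip per batch instead of one per row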
32 changes: 8 additions & 24 deletions yyetsweb/common/sync.py
@@ -65,9 +65,7 @@ def sleep(times=1):
class Zhuixinfan(BaseSync):
def run(self):
zhuixinfan = "http://www.fanxinzhui.com/rr/{}"
start = (self.sync.find_one({"name": "zhuixinfan"}) or {}).get(
"resource_id", os.getenv("ZHUIXINFAN_START", 20)
)
start = (self.sync.find_one({"name": "zhuixinfan"}) or {}).get("resource_id", os.getenv("ZHUIXINFAN_START", 20))
end = os.getenv("ZHUIXINFAN_END", 2500)
for i in range(start, end):
url = zhuixinfan.format(i)
@@ -123,13 +121,9 @@ def build_data(self, html, link):
for item in links:
content = item["href"]
if "ed2k" in content:
resource["files"].append(
{"way": "1", "way_cn": "电驴", "address": content, "passwd": ""}
)
resource["files"].append({"way": "1", "way_cn": "电驴", "address": content, "passwd": ""})
elif "magnet" in content:
resource["files"].append(
{"way": "2", "way_cn": "磁力", "address": content, "passwd": ""}
)
resource["files"].append({"way": "2", "way_cn": "磁力", "address": content, "passwd": ""})
elif "pan.baidu" in content:
baidu_password = res.span.a.nextSibling.nextSibling.text
resource["files"].append(
@@ -141,9 +135,7 @@
}
)
elif "weiyun" in content:
resource["files"].append(
{"way": "14", "way_cn": "微云", "address": content, "passwd": ""}
)
resource["files"].append({"way": "14", "way_cn": "微云", "address": content, "passwd": ""})
else:
logging.debug("Unknown link: %s", content)

@@ -167,9 +159,7 @@ def update_yyets(self, data):
)
else:
last_id = 90000
last = self.yyets.find_one(
{"data.info.id": {"$gte": last_id}}, sort=[("data.info.id", -1)]
)
last = self.yyets.find_one({"data.info.id": {"$gte": last_id}}, sort=[("data.info.id", -1)])
if last:
last_id = last["data"]["info"]["id"] + 1
logging.info("Inserting data.info.id: %s", last_id)
@@ -213,19 +203,13 @@ def run(self):
structure["data"]["info"]["enname"] = data["enname"]
structure["data"]["info"]["aliasname"] = data["aliasname"]
structure["data"]["info"]["channel"] = data["channel"]
structure["data"]["info"]["channel_cn"] = (
data["channel_cn"] or channel_cn
)
structure["data"]["info"]["channel_cn"] = data["channel_cn"] or channel_cn
structure["data"]["info"]["area"] = data["area"]
structure["data"]["list"] = []
structure["data"]["info"][
"source"
] = f"https://www.yysub.net/resource/{i}"
structure["data"]["info"]["source"] = f"https://www.yysub.net/resource/{i}"
self.insert_data(structure.copy())

self.sync.update_one(
{"name": "yysub"}, {"$set": {"resource_id": end}}, upsert=True
)
self.sync.update_one({"name": "yysub"}, {"$set": {"resource_id": end}}, upsert=True)
logging.info("YYsub Finished")

def insert_data(self, data):
21 changes: 21 additions & 0 deletions yyetsweb/databases/base.py
@@ -113,6 +113,7 @@ def __init__(self):
self.yyets_index = self.search_client.index("yyets")
self.comment_index = self.search_client.index("comment")
self.douban_index = self.search_client.index("douban")
self.subtitle_index = self.search_client.index("subtitle")
super().__init__()

def __del__(self):
@@ -152,6 +153,17 @@ def __get_douban(self):
]
)

def __get_subtitle(self):
return self.db["subtitle"].aggregate(
[
{
"$addFields": {
"_id": {"$toString": "$_id"},
}
},
]
)

def add_yyets(self):
logging.info("Adding yyets data to search engine")
data = list(self.__get_yyets())
@@ -167,6 +179,11 @@ def add_douban(self):
data = list(self.__get_douban())
self.douban_index.add_documents(data, primary_key="_id")

def add_subtitle(self):
logging.info("Adding subtitle data to search engine")
data = list(self.__get_subtitle())
self.subtitle_index.add_documents(data, primary_key="_id")

def search_yyets(self, keyword: "str"):
return self.yyets_index.search(keyword, {"matchingStrategy": "all"})["hits"]

@@ -176,11 +193,15 @@ def search_comment(self, keyword: "str"):
def search_douban(self, keyword: "str"):
return self.douban_index.search(keyword, {"matchingStrategy": "all"})["hits"]

def search_subtitle(self, keyword: "str"):
return self.subtitle_index.search(keyword, {"matchingStrategy": "all"})["hits"]

def run_import(self):
t0 = time.time()
self.add_yyets()
self.add_comment()
self.add_douban()
self.add_subtitle()
logging.info(f"Import data to search engine in {time.time() - t0:.2f}s")

def __monitor(self, col, fun):
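With the subtitle index wired into run_import, querying it directly through the Meilisearch Python client looks roughly like this (a sketch; the URL matches the port published in docker-compose.yml above, and the absence of an API key is an assumption):

import meilisearch

client = meilisearch.Client("http://127.0.0.1:7700")
# Same call shape as search_subtitle above: require every term to match.
hits = client.index("subtitle").search("keyword", {"matchingStrategy": "all"})["hits"]
for doc in hits:
    print(doc["_id"])  # stringified by the $toString stage in __get_subtitle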
32 changes: 8 additions & 24 deletions yyetsweb/databases/douban.py
@@ -33,9 +33,7 @@ def find_douban(self, resource_id: int):

douban_col = self.db["douban"]
yyets_col = self.db["yyets"]
data = douban_col.find_one(
{"resourceId": resource_id}, {"_id": False, "raw": False}
)
data = douban_col.find_one({"resourceId": resource_id}, {"_id": False, "raw": False})
if data:
logging.info("Existing data for %s", resource_id)
return data
@@ -58,12 +56,8 @@ def find_douban(self, resource_id: int):
douban_item = soup.find_all("div", class_="content")

fwd_link = unquote(douban_item[0].a["href"])
douban_id = re.findall(
r"https://movie\.douban\.com/subject/(\d*)/.*", fwd_link
)[0]
final_data = self.get_craw_data(
cname, douban_id, resource_id, search_html, session
)
douban_id = re.findall(r"https://movie\.douban\.com/subject/(\d*)/.*", fwd_link)[0]
final_data = self.get_craw_data(cname, douban_id, resource_id, search_html, session)
douban_col.insert_one(final_data.copy())
final_data.pop("raw")
return final_data
@@ -76,9 +70,7 @@ def get_craw_data(cname, douban_id, resource_id, search_html, session):
soup = BeautifulSoup(detail_html, "html.parser")

directors = [i.text for i in (soup.find_all("a", rel="v:directedBy"))]
release_date = (
poster_image_link
) = rating = year_text = intro = writers = episode_count = episode_duration = ""
release_date = poster_image_link = rating = year_text = intro = writers = episode_count = episode_duration = ""
with contextlib.suppress(IndexError):
episode_duration = soup.find_all("span", property="v:runtime")[0].text
for i in soup.find_all("span", class_="pl"):
@@ -92,21 +84,15 @@ def get_craw_data(cname, douban_id, resource_id, search_html, session):
genre = [i.text for i in soup.find_all("span", property="v:genre")]

with contextlib.suppress(IndexError):
release_date = soup.find_all("span", property="v:initialReleaseDate")[
0
].text
release_date = soup.find_all("span", property="v:initialReleaseDate")[0].text
with contextlib.suppress(IndexError):
poster_image_link = soup.find_all("div", id="mainpic")[0].a.img["src"]
with contextlib.suppress(IndexError):
rating = soup.find_all("strong", class_="ll rating_num")[0].text
with contextlib.suppress(IndexError):
year_text = re.sub(
r"[()]", "", soup.find_all("span", class_="year")[0].text
)
year_text = re.sub(r"[()]", "", soup.find_all("span", class_="year")[0].text)
with contextlib.suppress(IndexError):
intro = re.sub(
r"\s", "", soup.find_all("span", property="v:summary")[0].text
)
intro = re.sub(r"\s", "", soup.find_all("span", property="v:summary")[0].text)

final_data = {
"name": cname,
@@ -139,9 +125,7 @@ class DoubanReport(Mongo):
def get_error(self) -> dict:
return dict(data=list(self.db["douban_error"].find(projection={"_id": False})))

def report_error(
self, captcha: str, captcha_id: int, content: str, resource_id: int
) -> dict:
def report_error(self, captcha: str, captcha_id: int, content: str, resource_id: int) -> dict:
returned = {"status_code": 0, "message": ""}
verify_result = Captcha().verify_code(captcha, captcha_id)
if not verify_result["status"]:
(Diffs for the remaining six changed files were not loaded.)
