subtitle update
BennyThink committed Dec 5, 2024
1 parent 494a6b9 commit 4897e7f
Showing 13 changed files with 182 additions and 179 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -146,3 +146,4 @@ logs/*
/yyetsweb/templates/dump/yyets_mongo.gz
/yyetsweb/templates/dump/yyets_mysql.zip
/yyetsweb/templates/dump/yyets_sqlite.zip
/yyetsweb/subtitle_data/attachment/201001/17/758231_1263706947i2nW.rar
7 changes: 0 additions & 7 deletions docker-compose.replica.yml

This file was deleted.

20 changes: 4 additions & 16 deletions docker-compose.yml
@@ -25,6 +25,8 @@ services:
- MEILI_HTTP_PAYLOAD_SIZE_LIMIT=1073741824 #1GiB
volumes:
- meilisearch_data:/meili_data
ports:
- "127.0.0.1:7700:7700"

mysql:
image: ubuntu/mysql:8.0-22.04_beta
@@ -35,27 +37,11 @@
driver: none
command: "--skip-log-bin --default-authentication-plugin=mysql_native_password"

socat:
image: bennythink/socat
restart: unless-stopped
volumes:
- /var/run/docker.sock:/var/run/docker.sock
entrypoint: [ "socat", "tcp-listen:2375,fork,reuseaddr","unix-connect:/var/run/docker.sock" ]
logging:
driver: none

mailhog:
image: cd2team/mailhog
restart: unless-stopped
logging:
driver: none

bot:
image: bennythink/yyetsbot
depends_on:
- redis
- mongo
- socat
restart: always
env_file:
- env/yyets.env
@@ -70,6 +56,8 @@
- redis
- mysql
working_dir: /YYeTsBot/yyetsweb/
volumes:
- ./subtitle_data:/YYeTsBot/yyetsweb/subtitle_data
command: [ "python3","server.py","-h=0.0.0.0" ]
ports:
- "127.0.0.1:8888:8888"
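The new "127.0.0.1:7700" binding publishes Meilisearch on the host's loopback interface only. A quick reachability check from the host (a sketch, assuming no master key is configured) could look like:

import requests

# Meilisearch exposes a /health endpoint; "available" means the instance is up.
print(requests.get("http://127.0.0.1:7700/health").json())  # {"status": "available"}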
19 changes: 19 additions & 0 deletions scripts/migrate_sub.py
@@ -0,0 +1,19 @@
#!/usr/bin/env python3
# coding: utf-8

# YYeTsBot - migrate_sub.py

import pymongo
import pymysql
from pymysql.cursors import DictCursor

con = pymysql.connect(host="mysql", user="root", password="root", database="yyets", charset="utf8")
cur = con.cursor(cursor=DictCursor)
mongo_client = pymongo.MongoClient(host="mongo")
col = mongo_client["zimuzu"]["subtitle"]

cur.execute("select * from subtitle")

# 56134 rows
for sub in cur.fetchall():
col.insert_one(sub)
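
The script above inserts the ~56,000 subtitle rows one document at a time. A batched variant along these lines (a sketch, not part of the commit, assuming the same hostnames and table layout) would cut the round trips to MongoDB:

#!/usr/bin/env python3
# coding: utf-8
# Hypothetical batched take on migrate_sub.py

import pymongo
import pymysql
from pymysql.cursors import DictCursor

BATCH = 1000  # assumed batch size

con = pymysql.connect(host="mysql", user="root", password="root", database="yyets", charset="utf8")
cur = con.cursor(cursor=DictCursor)
col = pymongo.MongoClient(host="mongo")["zimuzu"]["subtitle"]

cur.execute("select * from subtitle")
while True:
    rows = cur.fetchmany(BATCH)
    if not rows:
        break
    col.insert_many(rows)  # one round trip per batch instead of one per row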
32 changes: 8 additions & 24 deletions yyetsweb/common/sync.py
@@ -65,9 +65,7 @@ def sleep(times=1):
class Zhuixinfan(BaseSync):
def run(self):
zhuixinfan = "http://www.fanxinzhui.com/rr/{}"
start = (self.sync.find_one({"name": "zhuixinfan"}) or {}).get(
"resource_id", os.getenv("ZHUIXINFAN_START", 20)
)
start = (self.sync.find_one({"name": "zhuixinfan"}) or {}).get("resource_id", os.getenv("ZHUIXINFAN_START", 20))
end = os.getenv("ZHUIXINFAN_END", 2500)
for i in range(start, end):
url = zhuixinfan.format(i)
@@ -123,13 +121,9 @@ def build_data(self, html, link):
for item in links:
content = item["href"]
if "ed2k" in content:
resource["files"].append(
{"way": "1", "way_cn": "电驴", "address": content, "passwd": ""}
)
resource["files"].append({"way": "1", "way_cn": "电驴", "address": content, "passwd": ""})
elif "magnet" in content:
resource["files"].append(
{"way": "2", "way_cn": "磁力", "address": content, "passwd": ""}
)
resource["files"].append({"way": "2", "way_cn": "磁力", "address": content, "passwd": ""})
elif "pan.baidu" in content:
baidu_password = res.span.a.nextSibling.nextSibling.text
resource["files"].append(
@@ -141,9 +135,7 @@
}
)
elif "weiyun" in content:
resource["files"].append(
{"way": "14", "way_cn": "微云", "address": content, "passwd": ""}
)
resource["files"].append({"way": "14", "way_cn": "微云", "address": content, "passwd": ""})
else:
logging.debug("Unknown link: %s", content)

@@ -167,9 +159,7 @@ def update_yyets(self, data):
)
else:
last_id = 90000
last = self.yyets.find_one(
{"data.info.id": {"$gte": last_id}}, sort=[("data.info.id", -1)]
)
last = self.yyets.find_one({"data.info.id": {"$gte": last_id}}, sort=[("data.info.id", -1)])
if last:
last_id = last["data"]["info"]["id"] + 1
logging.info("Inserting data.info.id: %s", last_id)
@@ -213,19 +203,13 @@ def run(self):
structure["data"]["info"]["enname"] = data["enname"]
structure["data"]["info"]["aliasname"] = data["aliasname"]
structure["data"]["info"]["channel"] = data["channel"]
structure["data"]["info"]["channel_cn"] = (
data["channel_cn"] or channel_cn
)
structure["data"]["info"]["channel_cn"] = data["channel_cn"] or channel_cn
structure["data"]["info"]["area"] = data["area"]
structure["data"]["list"] = []
structure["data"]["info"][
"source"
] = f"https://www.yysub.net/resource/{i}"
structure["data"]["info"]["source"] = f"https://www.yysub.net/resource/{i}"
self.insert_data(structure.copy())

self.sync.update_one(
{"name": "yysub"}, {"$set": {"resource_id": end}}, upsert=True
)
self.sync.update_one({"name": "yysub"}, {"$set": {"resource_id": end}}, upsert=True)
logging.info("YYsub Finished")

def insert_data(self, data):
21 changes: 21 additions & 0 deletions yyetsweb/databases/base.py
@@ -113,6 +113,7 @@ def __init__(self):
self.yyets_index = self.search_client.index("yyets")
self.comment_index = self.search_client.index("comment")
self.douban_index = self.search_client.index("douban")
self.subtitle_index = self.search_client.index("subtitle")
super().__init__()

def __del__(self):
@@ -152,6 +153,17 @@ def __get_douban(self):
]
)

def __get_subtitle(self):
return self.db["subtitle"].aggregate(
[
{
"$addFields": {
"_id": {"$toString": "$_id"},
}
},
]
)

def add_yyets(self):
logging.info("Adding yyets data to search engine")
data = list(self.__get_yyets())
@@ -167,6 +179,11 @@ def add_douban(self):
data = list(self.__get_douban())
self.douban_index.add_documents(data, primary_key="_id")

def add_subtitle(self):
logging.info("Adding subtitle data to search engine")
data = list(self.__get_subtitle())
self.subtitle_index.add_documents(data, primary_key="_id")

def search_yyets(self, keyword: "str"):
return self.yyets_index.search(keyword, {"matchingStrategy": "all"})["hits"]

@@ -176,11 +193,15 @@ def search_comment(self, keyword: "str"):
def search_douban(self, keyword: "str"):
return self.douban_index.search(keyword, {"matchingStrategy": "all"})["hits"]

def search_subtitle(self, keyword: "str"):
return self.subtitle_index.search(keyword, {"matchingStrategy": "all"})["hits"]

def run_import(self):
t0 = time.time()
self.add_yyets()
self.add_comment()
self.add_douban()
self.add_subtitle()
logging.info(f"Import data to search engine in {time.time() - t0:.2f}s")

def __monitor(self, col, fun):
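With the subtitle index wired into run_import, querying it directly through the Meilisearch Python client looks roughly like this (a sketch; the URL matches the port published in docker-compose.yml above, and the absence of an API key is an assumption):

import meilisearch

client = meilisearch.Client("http://127.0.0.1:7700")
# Same call shape as search_subtitle above: require every term to match.
hits = client.index("subtitle").search("keyword", {"matchingStrategy": "all"})["hits"]
for doc in hits:
    print(doc["_id"])  # stringified by the $toString stage in __get_subtitle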
32 changes: 8 additions & 24 deletions yyetsweb/databases/douban.py
@@ -33,9 +33,7 @@ def find_douban(self, resource_id: int):

douban_col = self.db["douban"]
yyets_col = self.db["yyets"]
data = douban_col.find_one(
{"resourceId": resource_id}, {"_id": False, "raw": False}
)
data = douban_col.find_one({"resourceId": resource_id}, {"_id": False, "raw": False})
if data:
logging.info("Existing data for %s", resource_id)
return data
@@ -58,12 +56,8 @@ def find_douban(self, resource_id: int):
douban_item = soup.find_all("div", class_="content")

fwd_link = unquote(douban_item[0].a["href"])
douban_id = re.findall(
r"https://movie\.douban\.com/subject/(\d*)/.*", fwd_link
)[0]
final_data = self.get_craw_data(
cname, douban_id, resource_id, search_html, session
)
douban_id = re.findall(r"https://movie\.douban\.com/subject/(\d*)/.*", fwd_link)[0]
final_data = self.get_craw_data(cname, douban_id, resource_id, search_html, session)
douban_col.insert_one(final_data.copy())
final_data.pop("raw")
return final_data
@@ -76,9 +70,7 @@ def get_craw_data(cname, douban_id, resource_id, search_html, session):
soup = BeautifulSoup(detail_html, "html.parser")

directors = [i.text for i in (soup.find_all("a", rel="v:directedBy"))]
release_date = (
poster_image_link
) = rating = year_text = intro = writers = episode_count = episode_duration = ""
release_date = poster_image_link = rating = year_text = intro = writers = episode_count = episode_duration = ""
with contextlib.suppress(IndexError):
episode_duration = soup.find_all("span", property="v:runtime")[0].text
for i in soup.find_all("span", class_="pl"):
@@ -92,21 +84,15 @@ def get_craw_data(cname, douban_id, resource_id, search_html, session):
genre = [i.text for i in soup.find_all("span", property="v:genre")]

with contextlib.suppress(IndexError):
release_date = soup.find_all("span", property="v:initialReleaseDate")[
0
].text
release_date = soup.find_all("span", property="v:initialReleaseDate")[0].text
with contextlib.suppress(IndexError):
poster_image_link = soup.find_all("div", id="mainpic")[0].a.img["src"]
with contextlib.suppress(IndexError):
rating = soup.find_all("strong", class_="ll rating_num")[0].text
with contextlib.suppress(IndexError):
year_text = re.sub(
r"[()]", "", soup.find_all("span", class_="year")[0].text
)
year_text = re.sub(r"[()]", "", soup.find_all("span", class_="year")[0].text)
with contextlib.suppress(IndexError):
intro = re.sub(
r"\s", "", soup.find_all("span", property="v:summary")[0].text
)
intro = re.sub(r"\s", "", soup.find_all("span", property="v:summary")[0].text)

final_data = {
"name": cname,
@@ -139,9 +125,7 @@ class DoubanReport(Mongo):
def get_error(self) -> dict:
return dict(data=list(self.db["douban_error"].find(projection={"_id": False})))

def report_error(
self, captcha: str, captcha_id: int, content: str, resource_id: int
) -> dict:
def report_error(self, captcha: str, captcha_id: int, content: str, resource_id: int) -> dict:
returned = {"status_code": 0, "message": ""}
verify_result = Captcha().verify_code(captcha, captcha_id)
if not verify_result["status"]:
(Diffs for the remaining six changed files were not loaded.)
