From d0b4e91c36d10728530b0670e0fe65bb67257d59 Mon Sep 17 00:00:00 2001 From: gabriel Date: Sat, 6 May 2023 19:58:29 -0300 Subject: [PATCH 1/6] update requirements --- requirements.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d779dbe..9374e29 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ bs4 -requests \ No newline at end of file +requests +tqdm +selenium \ No newline at end of file From bb594f2ce174cba0890b945a58c66d9fe49070b2 Mon Sep 17 00:00:00 2001 From: gabriel Date: Sat, 6 May 2023 22:37:59 -0300 Subject: [PATCH 2/6] update __init__, added more functions to import --- baka_tsuki/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/baka_tsuki/__init__.py b/baka_tsuki/__init__.py index b3ec5e7..9129cea 100644 --- a/baka_tsuki/__init__.py +++ b/baka_tsuki/__init__.py @@ -1,2 +1,2 @@ -from .getter import get_soup -from .searcher import get_links_light_novels_by_language +from .getter import get_soup, get_list_novels +from .searcher import get_links_light_novels_by_language \ No newline at end of file From 3d4f0d795a733e777cb56d19ea337ef88404fe42 Mon Sep 17 00:00:00 2001 From: gabriel Date: Sat, 6 May 2023 22:39:18 -0300 Subject: [PATCH 3/6] update .gitignore, added more files for ignore, vscode, json, selenium files, html --- .gitignore | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 92d76df..c80ba09 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ __pycache__/ test.py -.venv \ No newline at end of file +.venv +backlog.html +*.json +.vscode/ +geckodriver.log \ No newline at end of file From 4af288b54859d02349d9d6d8fe2fafb4554b48d6 Mon Sep 17 00:00:00 2001 From: gabriel Date: Sat, 6 May 2023 22:40:49 -0300 Subject: [PATCH 4/6] update main, added more functions in modules --- main.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 7e67156..163b5ad 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,13 @@ import baka_tsuki +import time + +category = 'https://www.baka-tsuki.org/project/index.php?title=Category:Light_novel' if __name__ == '__main__': - soup = baka_tsuki.get_soup() - list_novels_by_language = baka_tsuki.get_links_light_novels_by_language(soup) \ No newline at end of file + start_time = time.time() + + soup = baka_tsuki.get_soup(category) + list_novels_by_language = baka_tsuki.get_links_light_novels_by_language(soup=soup) + baka_tsuki.get_list_novels(list_novels_by_language=list_novels_by_language) + + print("--- %s seconds ---" % (time.time() - start_time)) \ No newline at end of file From 4050329122ca851fcfa3f815aa8ae37527bfaab3 Mon Sep 17 00:00:00 2001 From: gabriel Date: Sat, 6 May 2023 22:42:33 -0300 Subject: [PATCH 5/6] update searcher, added dictionary in func create_list_novels_by_language --- baka_tsuki/searcher.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/baka_tsuki/searcher.py b/baka_tsuki/searcher.py index e9e53d3..287b593 100644 --- a/baka_tsuki/searcher.py +++ b/baka_tsuki/searcher.py @@ -34,7 +34,10 @@ def create_list_novels_by_language(links_novels_by_language:list[BeautifulSoup]) language = parse_language_in_string(tag_a.text) list_novels_by_language.append( - {language:link} + { + 'language':language, + 'link':link + } ) return list_novels_by_language From 95e33ce4e5ff1a19c9b7c76accad6e79b6c7a541 Mon Sep 17 00:00:00 2001 From: gabriel Date: Sat, 6 May 2023 22:48:09 -0300 Subject: [PATCH 6/6] update getter added func get_list_novels that request and write the json --- baka_tsuki/getter.py | 167 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 160 insertions(+), 7 deletions(-) diff --git a/baka_tsuki/getter.py b/baka_tsuki/getter.py index 8dd4707..1c9cc37 100644 --- a/baka_tsuki/getter.py +++ b/baka_tsuki/getter.py @@ -1,16 +1,169 @@ import requests from bs4 import BeautifulSoup +import json +from tqdm import tqdm +from selenium import webdriver +from selenium.webdriver.firefox.options import Options -def get_soup(): - - baka = 'https://www.baka-tsuki.org/project/index.php?title=Category:Light_novel' +''' +progress_bar = tqdm(total=len(list_novels_by_language),colour='blue',desc=f'updating {language}') +''' + +def get_soup(link): - response = requests.get(baka) + response = requests.get(link) if response.content: content = response.content else: - raise Exception("Sorry, bad request") - + raise Exception(f"Sorry, bad request status_code: {0}".format(response)) + soup = BeautifulSoup(content, 'html.parser') - return soup \ No newline at end of file + return soup + +def get_list_novels(list_novels_by_language:list[dict]): + domain = 'https://www.baka-tsuki.org' + + def parse_title_novel(title:str, language:str): + title = title.replace('~','') + title = title.replace(language,'') + title = title.strip() + + return title + + def get_novel_details(link_novel:str): + soup = get_soup(link_novel) + content = soup.find('div', attrs={'id':'mw-content-text'}) + + def get_main_cover(link_novel:str): + options = Options() + options.add_argument("--headless") + browser = webdriver.Firefox(options=options) + browser.get(link_novel) + html = browser.page_source + browser.close() + + # html = link_novel + + soup = BeautifulSoup(html, 'html.parser') + image = soup.find('img', attrs={'class':'thumbimage'}) + try: + image = image['src'] + image = domain+str(image) + except: + image = 'Unknown' + return image + + def get_synopsis(content:BeautifulSoup): + synopsis = content.find_all(lambda tag: tag.name == 'p' + and len(list(tag.contents)) == 1 + and tag.string != None) + desc = '' + for i in synopsis: + desc += i.text.encode('utf-8') + synopsis = desc.strip() + + return synopsis + + def get_ilustrations(chapters_links:list[str]): + img_list = [] + + def is_ilustration_page(page:str): + if domain in page: + soup = get_soup(page) + try: + galery = soup.find('ul', attrs={'class':'gallery mw-gallery-traditional'}) + if galery is not None: + return True + except: + return False + return False + + for chapter in chapters_links: + if is_ilustration_page(chapter): + galery = soup.find('ul', attrs={'class':'gallery mw-gallery-traditional'}) + imgs = galery.find_all('img') + for img in imgs: + img_list.append(img['src']) + + return img_list + + + def get_novel_chapthers(content:BeautifulSoup): + chapters = [] + + def is_valid_chapter(tag): + links = tag.find_all('a') + for link in links: + if domain not in link['href']: + return False + return True + + dl_list = content.find_all('dl') + dl_list = list(filter(is_valid_chapter, dl_list)) + + for tag in dl_list: + links = tag.find_all('a') + for link in links: + chapters.append(link['href']) + + return chapters + + synopsis = get_synopsis(content) + cover = get_main_cover(link_novel) + chapters_links = get_novel_chapthers(content) + image_list = get_ilustrations(chapters_links) + + return synopsis, cover, chapters_links, image_list + + def get_list_of_novels_by_language(soup:BeautifulSoup, language:str): + + list_novels_by_language = [] + div_links_novels = soup.find('div', attrs={'class':'mw-content-ltr'}) + divs_novels_by_letter = div_links_novels.find_all('div', attrs={'class':'mw-category-group'}) + + for div in divs_novels_by_letter: + category_letter = div.find('h3').text.strip().lower() + list_novels_by_category = div.find_all('a') + for tag_a in list_novels_by_category: + link = domain+tag_a['href'] + title = tag_a.text + title = parse_title_novel(title, language) + synopsis, cover, chapters_links, image_list = get_novel_details(link) + list_novels_by_language.append( + { + 'category_letter':category_letter, + 'title':title, + 'link':link, + 'synopsis':synopsis, + 'cover':cover, + 'chapters_links':chapters_links, + 'image_list':image_list + } + ) + + return list_novels_by_language + + def get_novels_with_language(list_novels_by_language:list[dict]): + + list_novels = [] + for dct_novel in list_novels_by_language: + language = dct_novel['language'] + link = dct_novel['link'] + + link_novels_by_language = domain + link + soup = get_soup(link_novels_by_language) + list_novels_by_language = get_list_of_novels_by_language(soup, language) + list_novels.append({ + 'language':language, + 'list_novels_by_language':list_novels_by_language + }) + + return list_novels + + def create_json_novel(list_novels:list[dict]): + with open('./.novels.json', 'w') as file: + json.dump(list_novels, file, indent=4, ) + + list_novels = get_novels_with_language(list_novels_by_language) + create_json_novel(list_novels)