diff --git a/.gitignore b/.gitignore
index 92d76df..c80ba09 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
 __pycache__/
 test.py
-.venv
\ No newline at end of file
+.venv
+backlog.html
+*.json
+.vscode/
+geckodriver.log
\ No newline at end of file
diff --git a/baka_tsuki/__init__.py b/baka_tsuki/__init__.py
index b3ec5e7..9129cea 100644
--- a/baka_tsuki/__init__.py
+++ b/baka_tsuki/__init__.py
@@ -1,2 +1,2 @@
-from .getter import get_soup
-from .searcher import get_links_light_novels_by_language
+from .getter import get_soup, get_list_novels
+from .searcher import get_links_light_novels_by_language
\ No newline at end of file
diff --git a/baka_tsuki/getter.py b/baka_tsuki/getter.py
index 8dd4707..1c9cc37 100644
--- a/baka_tsuki/getter.py
+++ b/baka_tsuki/getter.py
@@ -1,16 +1,169 @@
 import requests
 from bs4 import BeautifulSoup
+import json
+from tqdm import tqdm
+from selenium import webdriver
+from selenium.webdriver.firefox.options import Options
 
-
-def get_soup():
-    
-    baka = 'https://www.baka-tsuki.org/project/index.php?title=Category:Light_novel'
+'''
+progress_bar = tqdm(total=len(list_novels_by_language),colour='blue',desc=f'updating {language}')
+'''
+
+def get_soup(link):
 
-    response = requests.get(baka)
+    response = requests.get(link)
     if response.content:
         content = response.content
     else:
-        raise Exception("Sorry, bad request")
-    
+        raise Exception(f"Sorry, bad request status_code: {response.status_code}")  # f-string interpolates directly; mixing it with .format always printed 0
+
     soup = BeautifulSoup(content, 'html.parser')
-    return soup
\ No newline at end of file
+    return soup
+
+def get_list_novels(list_novels_by_language:list[dict]):
+    domain = 'https://www.baka-tsuki.org'
+
+    def parse_title_novel(title:str, language:str):
+        title = title.replace('~','')
+        title = title.replace(language,'')
+        title = title.strip()
+
+        return title
+
+    def get_novel_details(link_novel:str):
+        soup = get_soup(link_novel)
+        content = soup.find('div', attrs={'id':'mw-content-text'})
+
+        def get_main_cover(link_novel:str):
+            options = Options()
+            options.add_argument("--headless")
+            browser = webdriver.Firefox(options=options)
+            browser.get(link_novel)
+            html = browser.page_source
+            browser.close()
+
+            # html = link_novel
+
+            soup = BeautifulSoup(html, 'html.parser')
+            image = soup.find('img', attrs={'class':'thumbimage'})
+            try:
+                image = image['src']
+                image = domain+str(image)
+            except (TypeError, KeyError):  # find() may return None, or the tag may lack 'src'
+                image = 'Unknown'
+            return image
+
+        def get_synopsis(content:BeautifulSoup):
+            synopsis = content.find_all(lambda tag: tag.name == 'p'
+                                        and len(list(tag.contents)) == 1
+                                        and tag.string is not None)
+            desc = ''
+            for i in synopsis:
+                desc += i.text  # .text is already str; encoding to bytes here would raise TypeError
+            synopsis = desc.strip()
+
+            return synopsis
+
+        def get_ilustrations(chapters_links:list[str]):
+            img_list = []
+
+            def is_ilustration_page(page:str):
+                if domain in page:
+                    soup = get_soup(page)
+                    try:
+                        galery = soup.find('ul', attrs={'class':'gallery mw-gallery-traditional'})
+                        if galery is not None:
+                            return True
+                    except Exception:  # request/parse failure -> treat as non-illustration page
+                        return False
+                return False
+
+            for chapter in chapters_links:
+                if is_ilustration_page(chapter):
+                    galery = get_soup(chapter).find('ul', attrs={'class':'gallery mw-gallery-traditional'})  # 'soup' was undefined in this scope before
+                    imgs = galery.find_all('img')
+                    for img in imgs:
+                        img_list.append(img['src'])
+
+            return img_list
+
+
+        def get_novel_chapters(content:BeautifulSoup):
+            chapters = []
+
+            def is_valid_chapter(tag):
+                links = tag.find_all('a')
+                for link in links:
+                    if domain not in link['href']:
+                        return False
+                return True
+
+            dl_list = content.find_all('dl')
+            dl_list = list(filter(is_valid_chapter, dl_list))
+
+            for tag in dl_list:
+                links = tag.find_all('a')
+                for link in links:
+                    chapters.append(link['href'])
+
+            return chapters
+
+        synopsis = get_synopsis(content)
+        cover = get_main_cover(link_novel)
+        chapters_links = get_novel_chapters(content)
+        image_list = get_ilustrations(chapters_links)
+
+        return synopsis, cover, chapters_links, image_list
+
+    def get_list_of_novels_by_language(soup:BeautifulSoup, language:str):
+
+        list_novels_by_language = []
+
+        div_links_novels = soup.find('div', attrs={'class':'mw-content-ltr'})
+        divs_novels_by_letter = div_links_novels.find_all('div', attrs={'class':'mw-category-group'})
+
+        for div in divs_novels_by_letter:
+            category_letter = div.find('h3').text.strip().lower()
+            list_novels_by_category = div.find_all('a')
+            for tag_a in list_novels_by_category:
+                link = domain+tag_a['href']
+                title = tag_a.text
+                title = parse_title_novel(title, language)
+                synopsis, cover, chapters_links, image_list = get_novel_details(link)
+                list_novels_by_language.append(
+                    {
+                        'category_letter':category_letter,
+                        'title':title,
+                        'link':link,
+                        'synopsis':synopsis,
+                        'cover':cover,
+                        'chapters_links':chapters_links,
+                        'image_list':image_list
+                    }
+                )
+
+        return list_novels_by_language
+
+    def get_novels_with_language(list_novels_by_language:list[dict]):
+
+        list_novels = []
+        for dct_novel in list_novels_by_language:
+            language = dct_novel['language']
+            link = dct_novel['link']
+
+            link_novels_by_language = domain + link
+            soup = get_soup(link_novels_by_language)
+            novels_for_language = get_list_of_novels_by_language(soup, language)  # renamed: no longer shadows the enclosing parameter
+            list_novels.append({
+                'language':language,
+                'list_novels_by_language':novels_for_language
+            })
+
+        return list_novels
+
+    def create_json_novel(list_novels:list[dict]):
+        with open('./.novels.json', 'w') as file:
+            json.dump(list_novels, file, indent=4, )
+
+    list_novels = get_novels_with_language(list_novels_by_language)
+    create_json_novel(list_novels)
diff --git a/baka_tsuki/searcher.py b/baka_tsuki/searcher.py
index e9e53d3..287b593 100644
--- a/baka_tsuki/searcher.py
+++ b/baka_tsuki/searcher.py
@@ -34,7 +34,10 @@ def create_list_novels_by_language(links_novels_by_language:list[BeautifulSoup])
         language = parse_language_in_string(tag_a.text)
 
         list_novels_by_language.append(
-            {language:link}
+            {
+                'language':language,
+                'link':link
+            }
         )
 
     return list_novels_by_language
diff --git a/main.py b/main.py
index 7e67156..163b5ad 100644
--- a/main.py
+++ b/main.py
@@ -1,5 +1,13 @@
 import baka_tsuki
+import time
+
+category = 'https://www.baka-tsuki.org/project/index.php?title=Category:Light_novel'
 
 if __name__ == '__main__':
-    soup = baka_tsuki.get_soup()
-    list_novels_by_language = baka_tsuki.get_links_light_novels_by_language(soup)
\ No newline at end of file
+    start_time = time.time()
+
+    soup = baka_tsuki.get_soup(category)
+    list_novels_by_language = baka_tsuki.get_links_light_novels_by_language(soup=soup)
+    baka_tsuki.get_list_novels(list_novels_by_language=list_novels_by_language)
+
+    print("--- %s seconds ---" % (time.time() - start_time))
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index d779dbe..9374e29 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,4 @@
 bs4
-requests
\ No newline at end of file
+requests
+tqdm
+selenium
\ No newline at end of file