Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

update branch #1

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
__pycache__/
test.py
.venv
.venv
backlog.html
*.json
.vscode/
geckodriver.log
4 changes: 2 additions & 2 deletions baka_tsuki/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .getter import get_soup
from .searcher import get_links_light_novels_by_language
from .getter import get_soup, get_list_novels
from .searcher import get_links_light_novels_by_language
167 changes: 160 additions & 7 deletions baka_tsuki/getter.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,169 @@
import requests
from bs4 import BeautifulSoup
import json
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

def get_soup():

baka = 'https://www.baka-tsuki.org/project/index.php?title=Category:Light_novel'
'''
progress_bar = tqdm(total=len(list_novels_by_language),colour='blue',desc=f'updating {language}')
'''

def get_soup(link):

response = requests.get(baka)
response = requests.get(link)
if response.content:
content = response.content
else:
raise Exception("Sorry, bad request")
raise Exception(f"Sorry, bad request status_code: {0}".format(response))

soup = BeautifulSoup(content, 'html.parser')

return soup
return soup

def get_list_novels(list_novels_by_language:list[dict]):
domain = 'https://www.baka-tsuki.org'

def parse_title_novel(title:str, language:str):
title = title.replace('~','')
title = title.replace(language,'')
title = title.strip()

return title

def get_novel_details(link_novel:str):
soup = get_soup(link_novel)
content = soup.find('div', attrs={'id':'mw-content-text'})

def get_main_cover(link_novel:str):
options = Options()
options.add_argument("--headless")
browser = webdriver.Firefox(options=options)
browser.get(link_novel)
html = browser.page_source
browser.close()

# html = link_novel

soup = BeautifulSoup(html, 'html.parser')
image = soup.find('img', attrs={'class':'thumbimage'})
try:
image = image['src']
image = domain+str(image)
except:
image = 'Unknown'
return image

def get_synopsis(content:BeautifulSoup):
synopsis = content.find_all(lambda tag: tag.name == 'p'
and len(list(tag.contents)) == 1
and tag.string != None)
desc = ''
for i in synopsis:
desc += i.text.encode('utf-8')
synopsis = desc.strip()

return synopsis

def get_ilustrations(chapters_links:list[str]):
img_list = []

def is_ilustration_page(page:str):
if domain in page:
soup = get_soup(page)
try:
galery = soup.find('ul', attrs={'class':'gallery mw-gallery-traditional'})
if galery is not None:
return True
except:
return False
return False

for chapter in chapters_links:
if is_ilustration_page(chapter):
galery = soup.find('ul', attrs={'class':'gallery mw-gallery-traditional'})
imgs = galery.find_all('img')
for img in imgs:
img_list.append(img['src'])

return img_list


def get_novel_chapthers(content:BeautifulSoup):
chapters = []

def is_valid_chapter(tag):
links = tag.find_all('a')
for link in links:
if domain not in link['href']:
return False
return True

dl_list = content.find_all('dl')
dl_list = list(filter(is_valid_chapter, dl_list))

for tag in dl_list:
links = tag.find_all('a')
for link in links:
chapters.append(link['href'])

return chapters

synopsis = get_synopsis(content)
cover = get_main_cover(link_novel)
chapters_links = get_novel_chapthers(content)
image_list = get_ilustrations(chapters_links)

return synopsis, cover, chapters_links, image_list

def get_list_of_novels_by_language(soup:BeautifulSoup, language:str):

list_novels_by_language = []
div_links_novels = soup.find('div', attrs={'class':'mw-content-ltr'})
divs_novels_by_letter = div_links_novels.find_all('div', attrs={'class':'mw-category-group'})

for div in divs_novels_by_letter:
category_letter = div.find('h3').text.strip().lower()
list_novels_by_category = div.find_all('a')
for tag_a in list_novels_by_category:
link = domain+tag_a['href']
title = tag_a.text
title = parse_title_novel(title, language)
synopsis, cover, chapters_links, image_list = get_novel_details(link)
list_novels_by_language.append(
{
'category_letter':category_letter,
'title':title,
'link':link,
'synopsis':synopsis,
'cover':cover,
'chapters_links':chapters_links,
'image_list':image_list
}
)

return list_novels_by_language

def get_novels_with_language(list_novels_by_language:list[dict]):

list_novels = []
for dct_novel in list_novels_by_language:
language = dct_novel['language']
link = dct_novel['link']

link_novels_by_language = domain + link
soup = get_soup(link_novels_by_language)
list_novels_by_language = get_list_of_novels_by_language(soup, language)
list_novels.append({
'language':language,
'list_novels_by_language':list_novels_by_language
})

return list_novels

def create_json_novel(list_novels:list[dict]):
with open('./.novels.json', 'w') as file:
json.dump(list_novels, file, indent=4, )

list_novels = get_novels_with_language(list_novels_by_language)
create_json_novel(list_novels)
5 changes: 4 additions & 1 deletion baka_tsuki/searcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,10 @@ def create_list_novels_by_language(links_novels_by_language:list[BeautifulSoup])
language = parse_language_in_string(tag_a.text)

list_novels_by_language.append(
{language:link}
{
'language':language,
'link':link
}
)

return list_novels_by_language
Expand Down
12 changes: 10 additions & 2 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
import baka_tsuki
import time

category = 'https://www.baka-tsuki.org/project/index.php?title=Category:Light_novel'

if __name__ == '__main__':
soup = baka_tsuki.get_soup()
list_novels_by_language = baka_tsuki.get_links_light_novels_by_language(soup)
start_time = time.time()

soup = baka_tsuki.get_soup(category)
list_novels_by_language = baka_tsuki.get_links_light_novels_by_language(soup=soup)
baka_tsuki.get_list_novels(list_novels_by_language=list_novels_by_language)

print("--- %s seconds ---" % (time.time() - start_time))
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
beautifulsoup4
requests
requests
tqdm
selenium