-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathahref2mdlink.py
97 lines (80 loc) · 2.88 KB
/
ahref2mdlink.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import traceback
import urllib2
import urlparse
import os
import bs4
class HtmlReaderFactory(object):
    """Choose a reader for *path*: remote URLs get CurlHtmlReader, local
    filesystem paths get FileHtmlReader. Both are context managers that
    yield a file-like object."""
    @staticmethod
    def get(path):
        # BUG FIX: the old test was `'http' in path`, which misrouted any
        # local path merely containing the substring (e.g.
        # '/tmp/httpdocs/a.html') to the URL reader. Only a genuine scheme
        # prefix marks a remote resource.
        if path.startswith('http://') or path.startswith('https://'):
            return CurlHtmlReader(path)
        return FileHtmlReader(path)
class FileHtmlReader(object):
    """Context manager that opens a local HTML file and yields the open
    file object; the file is closed on exit."""
    def __init__(self, path):
        self._path = path
        self._fd = None  # file object; populated by __enter__
    def __enter__(self):
        self._fd = open(self._path)
        return self._fd
    def __exit__(self, exc_type, exc_value, exc_tb):
        # Parameters renamed from (type, value, traceback): the old names
        # shadowed the `type` builtin and the imported `traceback` module.
        # The with-statement calls __exit__ positionally, so this is safe.
        if self._fd is not None:
            self._fd.close()
class CurlHtmlReader(object):
    """Context manager that fetches a remote URL with urllib2 and yields
    the response object; the response is closed on exit."""
    def __init__(self, url):
        self._url = url
        self._response = None  # urllib2 response; populated by __enter__
    def __enter__(self):
        self._response = urllib2.urlopen(self._url)
        return self._response
    def __exit__(self, exc_type, exc_value, exc_tb):
        # BUG FIX: the response was previously never closed, leaking the
        # underlying socket on every fetch. Close it like FileHtmlReader
        # closes its file.
        if self._response is not None:
            self._response.close()
def download_image(url, filepath):
    """Download *url* and write the raw bytes to *filepath*.

    The fetch happens before the destination file is opened, so a failed
    request no longer leaves an empty/truncated file behind (the old code
    opened the output file first).
    """
    data = urllib2.urlopen(url).read()
    with open(filepath, 'wb') as fw:
        fw.write(data)
def convert_url_relative_to_absolute(url, html_root):
    """Return *url* unchanged when it is already absolute (starts with
    'http'); otherwise prefix it with *html_root*."""
    return url if url.startswith('http') else html_root + url
def main():
if len(sys.argv) < 4:
print >>sys.stderr, 'usage: %s /path/to/html tag:selector /path/to/image' % sys.argv[0]
sys.exit(1)
html_url = sys.argv[1]
tag, tag_class = sys.argv[2].split(':')
image_dir = sys.argv[3]
parse_result = urlparse.urlparse(html_url)
html_root = '%s://%s' % (parse_result.scheme, parse_result.netloc)
try:
with HtmlReaderFactory.get(html_url) as fr:
soup = bs4.BeautifulSoup(fr, "html.parser")
body = None
if tag_class == '':
body = soup.find(tag)
else:
body = soup.find(tag, class_=tag_class)
for a_href in body.find_all('a'):
if len(a_href.text) == 0:
continue
text = a_href.text.encode('utf-8').strip('\n')
href = a_href.attrs['href']
print >>sys.stderr, '[debug] %s' % href
print '[%s](%s)' % (text, convert_url_relative_to_absolute(href, html_root))
for img in body.find_all('img'):
src = img.attrs['src']
filename = src.split('/')[-1]
image_url = convert_url_relative_to_absolute(src, html_root)
image_filepath = os.path.join(image_dir, filename)
image_markdown = '%s/%s' % (os.path.basename(image_dir.rstrip(os.sep)), filename)
print >>sys.stderr, '[debug] %s %s %s' % (src, image_url, image_filepath)
download_image(image_url, image_filepath)
print '' % image_markdown
except Exception as e:
print >>sys.stderr, '%s %s' % (e, traceback.format_exc())
# Script entry point: run the HTML-to-Markdown conversion when executed
# directly (not on import).
if __name__ == '__main__':
    main()