1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
|
#!/usr/bin/python
import argparse
import os
from pathlib import Path
import pickle
import shutil
import sys
import requests
from bs4 import BeautifulSoup
class Chandl:
    """Download every image posted in a 4chan thread.

    Already-downloaded file URLs are recorded in a per-thread pickle log
    under ``~/.chandl/logs/``, so repeat runs of the same thread only
    fetch files posted since the previous run.
    """

    def __init__(self, board, thread_id, output_path=None, quiet=False, delete_log=False):
        """Set up paths, URLs and the downloaded-files log.

        :param board: board short name, e.g. ``'g'``.
        :param thread_id: numeric thread id as a string.
        :param output_path: directory to save images into; defaults to
            ``~/.chandl/data/<thread_id>/``.
        :param quiet: when True, silence all ``print`` output.
        :param delete_log: when True, discard the existing log so every
            file in the thread is downloaded again.
        """
        self.board = board
        self.thread_id = thread_id
        self.output_path = output_path if output_path else f'{str(Path.home())}/.chandl/data/{self.thread_id}/'
        self.output_path_exists = os.path.exists(self.output_path)
        self.quiet = quiet
        if self.quiet:
            # Redirect stdout to devnull; the handle is deliberately kept
            # open for the life of the process and reclaimed at exit.
            sys.stdout = open(os.devnull, 'w')
        self.log_path = f'{str(Path.home())}/.chandl/logs/{self.thread_id}.pickle'
        self.delete_log = delete_log
        # Only remove the log if it exists — passing -d before any log has
        # been written used to raise FileNotFoundError.
        if self.delete_log and os.path.exists(self.log_path):
            os.remove(self.log_path)
        self.thread_url = f'https://boards.4chan.org/{board}/thread/{thread_id}'
        self.cdn_url = f'https://i.4cdn.org/{self.board}' + '/{file_name}'
        self.downloaded_files = [] if not os.path.exists(self.log_path) else self.deserialize_log_file()
        self.download_count = 0

    def deserialize_log_file(self):
        """Load and return the list of previously downloaded file URLs.

        NOTE: the pickle is written by this tool itself, not untrusted
        input, so ``pickle.load`` is acceptable here.
        """
        print('Reading log file...')
        with open(self.log_path, 'rb') as f:
            self.downloaded_files = pickle.load(f)
        return self.downloaded_files

    def serialize_log_file(self):
        """Persist the list of downloaded file URLs to ``self.log_path``."""
        print('Saving log file...')
        # Create the log *directory* (parent of the file). The old code
        # mkdir'd the log path itself — producing a directory named
        # "<thread_id>.pickle" — and then dumped to a different file, so
        # deserialize_log_file could never read the log back, and a second
        # run crashed with FileExistsError.
        Path(self.log_path).parent.mkdir(parents=True, exist_ok=True)
        with open(self.log_path, 'wb') as f:
            pickle.dump(self.downloaded_files, f)

    def main(self):
        """Fetch the thread page, download any new images, update the log."""
        print(f'Logs currently contain {len(self.downloaded_files)} files...')
        resp = self.retrieve_thread_data()
        self.parse_thread_data_for_image_urls(resp.content)
        print(f'Downloaded {self.download_count} files!')
        print(f'Logs currently contain {len(self.downloaded_files)} files...')
        self.serialize_log_file()

    def retrieve_thread_data(self):
        """GET the thread HTML page and return the response.

        :raises requests.HTTPError: if the thread is gone (404) or the
            request otherwise fails — better than silently parsing an
            error page for image links.
        """
        print('Retrieving thread data...')
        resp = requests.get(self.thread_url)
        resp.raise_for_status()
        return resp

    def parse_thread_data_for_image_urls(self, thread_data):
        """Scan the thread HTML for posted files and download new ones.

        :param thread_data: raw HTML bytes of the thread page.
        """
        print('Parsing thread data...')
        soup = BeautifulSoup(thread_data, 'html.parser')
        # Each posted file is announced in a div.fileText whose first <a>
        # links to the CDN copy; the link text is the original filename.
        for file_div in soup.find_all('div', attrs={'class': 'fileText'}):
            file_path = self.cdn_url.format(file_name=file_div.a["href"].split('/')[-1])
            file_name = file_div.a.next_element
            if file_path not in self.downloaded_files:
                self.save_thread_image(file_path, file_name)
                self.downloaded_files.append(file_path)
                self.download_count += 1

    def save_thread_image(self, file_path, file_name):
        """Stream one image from the CDN into the output directory.

        :param file_path: full CDN URL of the file.
        :param file_name: original filename to save under.
        """
        # Only create the output directory if there are files to be saved.
        if not self.output_path_exists:
            print('Creating output path...')
            # exist_ok guards against the directory appearing between the
            # initial check in __init__ and this call.
            Path(self.output_path).mkdir(parents=True, exist_ok=True)
            self.output_path_exists = True
        full_file_path = f"{self.output_path}{file_name}"
        print(f'Saving {file_name} to {full_file_path}...')
        # Use the response as a context manager so the streamed connection
        # is always released, even if the file write fails.
        with requests.get(file_path, stream=True) as resp:
            with open(full_file_path, 'wb') as f:
                shutil.copyfileobj(resp.raw, f)
if __name__ == '__main__':
    # CLI entry point: board and thread id are required positionals,
    # output path / quiet / delete-logs are optional.
    cli = argparse.ArgumentParser()
    cli.add_argument('board', type=str)
    cli.add_argument('thread_id', type=str)
    cli.add_argument('-o', '--output-path', dest='output_path', type=str, default=None)
    cli.add_argument('-q', '--quiet', dest='quiet', action='store_true', default=False)
    cli.add_argument('-d', '--delete-logs', dest='delete_logs', action='store_true', default=False)
    opts = cli.parse_args()
    Chandl(opts.board, opts.thread_id, opts.output_path, opts.quiet, opts.delete_logs).main()
|