import argparse
import os
from pathlib import Path
import pickle
import shutil
import sys

import requests
from bs4 import BeautifulSoup

# Seconds to wait for the server to connect/send data before a request is
# abandoned; without a timeout `requests` can block forever.
REQUEST_TIMEOUT = 30


class Chandl:
    """Download the files attached to a 4chan thread.

    URLs of files that were already fetched are kept in a pickle log
    (``logs/<thread_id>.pickle``) so that repeated runs only download files
    that are not in the log yet.
    """

    def __init__(self, board, thread_id, output_path=None, quiet=False, delete_log=False):
        """Set up paths and URLs and load any existing download log.

        Args:
            board: Board name used to build the thread and CDN URLs.
            thread_id: Thread identifier; also names the log file and the
                default output directory.
            output_path: Directory to save files into. Defaults to
                ``data/<thread_id>/``.
            quiet: When true, progress messages are not printed.
            delete_log: When true, an existing log file is removed first, so
                every file in the thread is treated as not yet downloaded.
        """
        self.board = board
        self.thread_id = thread_id
        self.output_path = output_path if output_path else f'data/{self.thread_id}/'
        self.output_path_exists = os.path.exists(self.output_path)
        # Quiet mode is honoured by `_log` rather than by rebinding
        # `sys.stdout`, which silenced the whole process and leaked a handle.
        self.quiet = quiet
        self.log_path = f'logs/{thread_id}.pickle'
        self.delete_log = delete_log
        if self.delete_log:
            try:
                os.remove(self.log_path)
            except FileNotFoundError:
                # Nothing to delete on a first run; that is not an error.
                pass
        self.thread_url = f'https://boards.4chan.org/{board}/thread/{thread_id}'
        # Template: `{file_name}` is filled in per file via str.format.
        self.cdn_url = f'https://i.4cdn.org/{self.board}' + '/{file_name}'
        self.downloaded_files = [] if not os.path.exists(self.log_path) else self.deserialize_log_file()
        self.download_count = 0

    def _log(self, message):
        """Print a progress message unless quiet mode is enabled."""
        if not self.quiet:
            print(message)

    def deserialize_log_file(self):
        """Load the list of downloaded file URLs from the log file.

        Returns:
            The loaded list, which is also stored on ``self.downloaded_files``.
        """
        self._log('Reading log file...')
        # NOTE(security): pickle.load can execute arbitrary code. Only load
        # log files that were written by this tool.
        with open(self.log_path, 'rb') as f:
            self.downloaded_files = pickle.load(f)
        return self.downloaded_files

    def serialize_log_file(self):
        """Write ``self.downloaded_files`` to the log file."""
        self._log('Saving log file...')
        # The log directory may not exist yet on a fresh checkout.
        Path(self.log_path).parent.mkdir(parents=True, exist_ok=True)
        with open(self.log_path, 'wb') as f:
            # pickle.dump returns None, so its result must not be assigned
            # back to self.downloaded_files.
            pickle.dump(self.downloaded_files, f)

    def main(self):
        """Fetch the thread, download files not in the log, and save the log."""
        self._log(f'Logs currently contain {len(self.downloaded_files)} files...')
        resp = self.retrieve_thread_data()
        try:
            self.parse_thread_data_for_image_urls(resp.content)
        finally:
            # Persist progress even when a download raises part-way through,
            # so completed files are not fetched again on the next run.
            self._log(f'Downloaded {self.download_count} files!')
            self._log(f'Logs currently contain {len(self.downloaded_files)} files...')
            self.serialize_log_file()

    def retrieve_thread_data(self):
        """Request the thread page and return the ``requests`` response."""
        self._log('Retrieving thread data...')
        return requests.get(self.thread_url, timeout=REQUEST_TIMEOUT)

    def parse_thread_data_for_image_urls(self, thread_data):
        """Find file links in the thread HTML and download the new ones.

        Every ``div`` with class ``fileText`` is inspected; the last path
        component of its anchor's ``href`` is combined with the CDN URL
        template, and the anchor text is used as the local file name. URLs
        already present in ``self.downloaded_files`` are skipped.

        Args:
            thread_data: HTML of the thread page.
        """
        self._log('Parsing thread data...')
        soup = BeautifulSoup(thread_data, 'html.parser')
        # Set copy for O(1) membership tests; the list itself stays the
        # persisted format.
        seen = set(self.downloaded_files)
        for file_div in soup.find_all('div', attrs={'class': 'fileText'}):
            anchor = file_div.a
            href = anchor.get('href') if anchor is not None else None
            if not href:
                # No link to download from (no anchor, or anchor without href).
                continue
            file_path = self.cdn_url.format(file_name=href.split('/')[-1])
            if file_path in seen:
                continue
            try:
                self.save_thread_image(file_path, anchor.get_text())
            except requests.RequestException as err:
                # Skip this file but keep going; it is not logged, so the
                # next run will try it again.
                print(f'Skipping {file_path}: {err}', file=sys.stderr)
                continue
            seen.add(file_path)
            self.downloaded_files.append(file_path)
            self.download_count += 1

    def save_thread_image(self, file_path, file_name):
        """Download one file and write it into the output directory.

        Args:
            file_path: URL of the file to download.
            file_name: Name to save the file under. Only its final path
                component is used; if that is empty, the last component of
                the URL is used instead.

        Raises:
            requests.RequestException: If the request fails or the server
                answers with an HTTP error status.
        """
        # only create output directory if there are files to be saved
        if not self.output_path_exists:
            self._log('Creating output path...')
            Path(self.output_path).mkdir(parents=True, exist_ok=True)
            self.output_path_exists = True

        # The name comes from page content, so strip any directory part to
        # keep the write inside the output directory.
        safe_name = os.path.basename(str(file_name))
        if safe_name in ('', '.', '..'):
            safe_name = file_path.rsplit('/', 1)[-1]
        full_file_path = os.path.join(self.output_path, safe_name)

        with requests.get(file_path, stream=True, timeout=REQUEST_TIMEOUT) as resp:
            # Fail before opening the target so an error page is never
            # stored as if it were the file.
            resp.raise_for_status()
            self._log(f'Saving {safe_name} to {full_file_path}...')
            with open(full_file_path, 'wb') as f:
                shutil.copyfileobj(resp.raw, f)


if __name__ == '__main__':
    # Command line: chandl BOARD THREAD_ID [-o PATH] [-q] [-d]
    parser = argparse.ArgumentParser()
    parser.add_argument('board', type=str)
    parser.add_argument('thread_id', type=str)
    parser.add_argument('-o', '--output-path', dest='output_path', type=str, default=None)
    parser.add_argument('-q', '--quiet', dest='quiet', action='store_true', default=False)
    parser.add_argument('-d', '--delete-logs', dest='delete_logs', action='store_true', default=False)
    args = parser.parse_args()

    c = Chandl(args.board, args.thread_id, args.output_path, args.quiet, args.delete_logs)
    c.main()