"""Download the files attached to a 4chan thread.

Usage::

    python chandl.py BOARD THREAD_ID [-o OUTPUT_PATH] [-q] [-d]

Files are saved under ``data/<thread_id>/`` unless ``-o`` is given.  The CDN
URLs that were already fetched are remembered in ``logs/<thread_id>.pickle``
so that a later run only downloads files it has not seen before.

Third-party dependencies, as pinned by the requirements.txt added in the same
commit (393370ab, "adding script", Dominic DiTaranto, 2024-07-28):

    beautifulsoup4==4.12.3
    certifi==2024.7.4
    charset-normalizer==3.3.2
    idna==3.7
    requests==2.32.3
    soupsieve==2.5
    urllib3==2.2.2
"""
import argparse
import os
from pathlib import Path
import pickle
import shutil
import sys

# NOTE: ``requests`` and ``bs4`` are imported inside the methods that use
# them, so the log bookkeeping can be imported and exercised without the
# third-party packages being installed.


class Chandl:
    """Downloader for the files of a single thread.

    Attributes:
        board: Board short name, e.g. ``'g'``.
        thread_id: Thread number, as a string.
        output_path: Directory the files are written to.
        log_path: Pickle file holding the list of URLs already downloaded.
        downloaded_files: List of CDN URLs downloaded so far (all runs).
        download_count: Number of files downloaded by this instance.
    """

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 30

    # Path components that must never be used as an output file name.
    _UNSAFE_NAMES = ('', '.', '..')

    def __init__(self, board, thread_id, output_path=None, quiet=False, delete_log=False):
        """Set up paths and load the download log.

        Args:
            board: Board short name.
            thread_id: Thread number.
            output_path: Target directory; defaults to ``data/<thread_id>/``.
            quiet: When true, progress messages are suppressed.
            delete_log: When true, any existing log for this thread is removed
                first, so every file is downloaded again.
        """
        self.board = board
        self.thread_id = thread_id

        self.output_path = output_path if output_path else f'data/{self.thread_id}/'
        self.output_path_exists = os.path.exists(self.output_path)

        # Progress output is gated in _say() instead of rebinding sys.stdout,
        # which would silence the whole process and leak a file handle.
        self.quiet = quiet

        self.log_path = f'logs/{thread_id}.pickle'

        self.delete_log = delete_log
        if self.delete_log:
            try:
                os.remove(self.log_path)
            except FileNotFoundError:
                # Nothing to delete on a first run; that is not an error.
                pass

        self.thread_url = f'https://boards.4chan.org/{board}/thread/{thread_id}'
        self.cdn_url = f'https://i.4cdn.org/{self.board}' + '/{file_name}'

        if os.path.exists(self.log_path):
            self.downloaded_files = self.deserialize_log_file()
        else:
            self.downloaded_files = []

        self.download_count = 0

    def _say(self, message):
        """Print a progress message unless running in quiet mode."""
        if not self.quiet:
            print(message)

    @classmethod
    def _safe_file_name(cls, display_name, fallback):
        """Return ``display_name`` reduced to a bare file name.

        The name comes from remote HTML, so directory components are dropped
        to keep the download inside the output directory.  ``fallback`` is
        returned when nothing usable is left.
        """
        name = os.path.basename(str(display_name).strip())
        return fallback if name in cls._UNSAFE_NAMES else name

    def deserialize_log_file(self):
        """Load and return the list of already-downloaded URLs."""
        self._say('Reading log file...')
        # SECURITY: pickle.load can execute arbitrary code.  Only point this
        # at log files written by serialize_log_file(), never at foreign data.
        with open(self.log_path, 'rb') as f:
            self.downloaded_files = pickle.load(f)
        return self.downloaded_files

    def serialize_log_file(self):
        """Write the list of downloaded URLs to the log file."""
        self._say('Saving log file...')
        # The log directory does not exist on a first run.
        Path(self.log_path).parent.mkdir(parents=True, exist_ok=True)
        with open(self.log_path, 'wb') as f:
            # pickle.dump returns None; do not assign its result back.
            pickle.dump(self.downloaded_files, f)

    def main(self):
        """Fetch the thread, download new files and update the log."""
        self._say(f'Logs currently contain {len(self.downloaded_files)} files...')

        resp = self.retrieve_thread_data()

        try:
            self.parse_thread_data_for_image_urls(resp.content)

            self._say(f'Downloaded {self.download_count} files!')
            self._say(f'Logs currently contain {len(self.downloaded_files)} files...')
        finally:
            # Persist progress even when the run is interrupted part-way.
            self.serialize_log_file()

    def retrieve_thread_data(self):
        """Request the thread page and return the response.

        Raises:
            requests.RequestException: On connection problems or when the
                server answers with an error status (e.g. a pruned thread).
        """
        import requests

        self._say('Retrieving thread data...')
        resp = requests.get(self.thread_url, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        return resp

    def parse_thread_data_for_image_urls(self, thread_data):
        """Download every file linked from ``thread_data`` that is not logged yet.

        Args:
            thread_data: HTML of the thread page.
        """
        from bs4 import BeautifulSoup
        import requests

        self._say('Parsing thread data...')
        soup = BeautifulSoup(thread_data, 'html.parser')

        # Set copy for O(1) membership tests; the list stays the stored format.
        already_downloaded = set(self.downloaded_files)

        for file_div in soup.find_all('div', attrs={'class': 'fileText'}):
            link = file_div.a
            if link is None or not link.get('href'):
                continue

            cdn_name = link['href'].split('/')[-1]
            if cdn_name in self._UNSAFE_NAMES:
                continue

            file_path = self.cdn_url.format(file_name=cdn_name)
            if file_path in already_downloaded:
                continue

            file_name = self._safe_file_name(link.get_text(strip=True), cdn_name)

            try:
                self.save_thread_image(file_path, file_name)
            except requests.RequestException as err:
                # Leave the URL out of the log so the next run retries it.
                print(f'Skipping {file_path}: {err}', file=sys.stderr)
                continue

            self.downloaded_files.append(file_path)
            already_downloaded.add(file_path)
            self.download_count += 1

    def save_thread_image(self, file_path, file_name):
        """Download ``file_path`` and store it as ``file_name`` in the output path.

        Args:
            file_path: Full URL of the file.
            file_name: Name to save the file under.

        Raises:
            requests.RequestException: On connection problems or when the
                server answers with an error status.
        """
        import requests

        with requests.get(file_path, stream=True, timeout=self.REQUEST_TIMEOUT) as resp:
            # Without this an error page would be written out as the "image".
            resp.raise_for_status()
            # Have urllib3 undo gzip/deflate transfer encoding on the raw stream.
            resp.raw.decode_content = True

            # only create output directory if there are files to be saved
            if not self.output_path_exists:
                self._say('Creating output path...')
                Path(self.output_path).mkdir(parents=True, exist_ok=True)
                self.output_path_exists = True

            full_file_path = os.path.join(self.output_path, file_name)

            self._say(f'Saving {file_name} to {full_file_path}...')
            with open(full_file_path, 'wb') as f:
                shutil.copyfileobj(resp.raw, f)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Download the files of a 4chan thread.')
    parser.add_argument('board', type=str, help='board short name')
    parser.add_argument('thread_id', type=str, help='thread number')
    parser.add_argument('-o', '--output-path', dest='output_path', type=str, default=None,
                        help='directory to save files to (default: data/<thread_id>/)')
    parser.add_argument('-q', '--quiet', dest='quiet', action='store_true', default=False,
                        help='suppress progress messages')
    parser.add_argument('-d', '--delete-logs', dest='delete_logs', action='store_true', default=False,
                        help='delete the download log for this thread before starting')

    args = parser.parse_args()

    c = Chandl(args.board, args.thread_id, args.output_path, args.quiet, args.delete_logs)
    c.main()