1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
|
#!/usr/bin/python
import argparse
import os
from pathlib import Path
import pickle
import shutil
import sys
import requests
from bs4 import BeautifulSoup
class Chandl:
    """Download every image posted in a 4chan thread.

    Already-downloaded file URLs are recorded in a per-thread pickle log
    under ``~/.chandl/logs/``, so repeat runs of the same thread only
    fetch files posted since the previous run.
    """

    def __init__(self, board, thread_id, output_path=None, quiet=False, delete_log=False):
        """Set up paths, URLs and the downloaded-files log.

        :param board: board short name, e.g. ``'g'``.
        :param thread_id: numeric thread id as a string.
        :param output_path: directory to save images into; defaults to
            ``~/.chandl/data/<thread_id>/``.
        :param quiet: when True, silence all ``print`` output.
        :param delete_log: when True, discard the existing log so every
            file in the thread is downloaded again.
        """
        self.board = board
        self.thread_id = thread_id
        self.output_path = output_path if output_path else f'{str(Path.home())}/.chandl/data/{self.thread_id}/'
        self.output_path_exists = os.path.exists(self.output_path)
        self.quiet = quiet
        if self.quiet:
            # Redirect stdout to devnull; the handle is deliberately kept
            # open for the life of the process and reclaimed at exit.
            sys.stdout = open(os.devnull, 'w')
        self.log_path = f'{str(Path.home())}/.chandl/logs/{self.thread_id}.pickle'
        self.delete_log = delete_log
        # Only remove the log if it exists — passing -d before any log has
        # been written used to raise FileNotFoundError.
        if self.delete_log and os.path.exists(self.log_path):
            os.remove(self.log_path)
        self.thread_url = f'https://boards.4chan.org/{board}/thread/{thread_id}'
        self.cdn_url = f'https://i.4cdn.org/{self.board}' + '/{file_name}'
        self.downloaded_files = [] if not os.path.exists(self.log_path) else self.deserialize_log_file()
        self.download_count = 0

    def deserialize_log_file(self):
        """Load and return the list of previously downloaded file URLs.

        NOTE: the pickle is written by this tool itself, not untrusted
        input, so ``pickle.load`` is acceptable here.
        """
        print('Reading log file...')
        with open(self.log_path, 'rb') as f:
            self.downloaded_files = pickle.load(f)
        return self.downloaded_files

    def serialize_log_file(self):
        """Persist the list of downloaded file URLs to ``self.log_path``."""
        print('Saving log file...')
        # Create the log *directory* (parent of the file). The old code
        # mkdir'd the log path itself — producing a directory named
        # "<thread_id>.pickle" — and then dumped to a different file, so
        # deserialize_log_file could never read the log back, and a second
        # run crashed with FileExistsError.
        Path(self.log_path).parent.mkdir(parents=True, exist_ok=True)
        with open(self.log_path, 'wb') as f:
            pickle.dump(self.downloaded_files, f)

    def main(self):
        """Fetch the thread page, download any new images, update the log."""
        print(f'Logs currently contain {len(self.downloaded_files)} files...')
        resp = self.retrieve_thread_data()
        self.parse_thread_data_for_image_urls(resp.content)
        print(f'Downloaded {self.download_count} files!')
        print(f'Logs currently contain {len(self.downloaded_files)} files...')
        self.serialize_log_file()

    def retrieve_thread_data(self):
        """GET the thread HTML page and return the response.

        :raises requests.HTTPError: if the thread is gone (404) or the
            request otherwise fails — better than silently parsing an
            error page for image links.
        """
        print('Retrieving thread data...')
        resp = requests.get(self.thread_url)
        resp.raise_for_status()
        return resp

    def parse_thread_data_for_image_urls(self, thread_data):
        """Scan the thread HTML for posted files and download new ones.

        :param thread_data: raw HTML bytes of the thread page.
        """
        print('Parsing thread data...')
        soup = BeautifulSoup(thread_data, 'html.parser')
        # Each posted file is announced in a div.fileText whose first <a>
        # links to the CDN copy; the link text is the original filename.
        for file_div in soup.find_all('div', attrs={'class': 'fileText'}):
            file_path = self.cdn_url.format(file_name=file_div.a["href"].split('/')[-1])
            file_name = file_div.a.next_element
            if file_path not in self.downloaded_files:
                self.save_thread_image(file_path, file_name)
                self.downloaded_files.append(file_path)
                self.download_count += 1

    def save_thread_image(self, file_path, file_name):
        """Stream one image from the CDN into the output directory.

        :param file_path: full CDN URL of the file.
        :param file_name: original filename to save under.
        """
        # Only create the output directory if there are files to be saved.
        if not self.output_path_exists:
            print('Creating output path...')
            # exist_ok guards against the directory appearing between the
            # initial check in __init__ and this call.
            Path(self.output_path).mkdir(parents=True, exist_ok=True)
            self.output_path_exists = True
        full_file_path = f"{self.output_path}{file_name}"
        print(f'Saving {file_name} to {full_file_path}...')
        # Use the response as a context manager so the streamed connection
        # is always released, even if the file write fails.
        with requests.get(file_path, stream=True) as resp:
            with open(full_file_path, 'wb') as f:
                shutil.copyfileobj(resp.raw, f)
if __name__ == '__main__':
    # CLI entry point: board and thread id are required positionals,
    # output path / quiet / delete-logs are optional.
    cli = argparse.ArgumentParser()
    cli.add_argument('board', type=str)
    cli.add_argument('thread_id', type=str)
    cli.add_argument('-o', '--output-path', dest='output_path', type=str, default=None)
    cli.add_argument('-q', '--quiet', dest='quiet', action='store_true', default=False)
    cli.add_argument('-d', '--delete-logs', dest='delete_logs', action='store_true', default=False)
    opts = cli.parse_args()
    Chandl(opts.board, opts.thread_id, opts.output_path, opts.quiet, opts.delete_logs).main()
|