Allplus的單字

如何爬取 All Plus 雜誌的單字並匯出成 JSON

1
import requests
2
from bs4 import BeautifulSoup
3
from json import dumps
4
import re
5

6
def after_name(raw_name: str) -> str:
    r"""Turn a scraped title into a string that is safe to use as a file name.

    The steps, in order:

    1. Newline characters are removed (not replaced with a space) and
       leading/trailing whitespace is stripped.
    2. Each of the characters \ / : * ? " < > | is replaced with an
       underscore.
    3. Every remaining run of whitespace is collapsed to a single space.

    Args:
        raw_name: The raw text to clean up.

    Returns:
        The cleaned name; an empty string if nothing is left after stripping.
    """
    flattened = raw_name.replace('\n', '').strip()
    name = re.sub(r'[\\/:*?"<>|]', '_', flattened)
    return re.sub(r'\s+', ' ', name)
10

11
def crawler(page, timeout=10):
    """Download one vocabulary page and extract its word entries.

    Args:
        page: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A tuple ``(vocabulary_list, file_name)``. ``vocabulary_list`` is a
        list of dicts with the keys ``"vocabulary"``, ``"partOfSpeech"`` and
        ``"chinese"``; ``file_name`` is the text of the page's
        ``<span class="small">`` element after cleaning by ``after_name``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page has no ``<span class="small">`` element to
            derive the file name from.
    """
    response = requests.get(page, timeout=timeout)
    # Fail here on 4xx/5xx instead of parsing an error page as vocabulary.
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    vocabulary_list = []
    for section in soup.find_all('div', class_='main-word'):
        word_tag = section.find('div', class_='word')
        explain_tag = section.find('div', class_='explain')
        if word_tag is None or explain_tag is None:
            # Incomplete entry: skip it rather than abort the whole page.
            continue

        pos_tag = explain_tag.find('span', class_='part-of-speech-tag')
        pos = pos_tag.text.strip() if pos_tag is not None else ''

        # Keeps only the last space-separated token of the explanation text.
        # NOTE(review): presumably the Chinese gloss follows the
        # part-of-speech tag; a gloss that itself contains a space would be
        # truncated here -- confirm against the real page markup.
        chinese = explain_tag.text.strip().split(' ')[-1]

        vocabulary_list.append({
            "vocabulary": word_tag.text.strip(),
            "partOfSpeech": pos,
            "chinese": chinese,
        })

    title_tag = soup.find('span', class_='small')
    if title_tag is None:
        raise ValueError(f"No <span class='small'> title found on {page}")
    file_name = after_name(title_tag.text)
    return vocabulary_list, file_name
29

30
if __name__ == "__main__":
    # Adjust the year and month in this URL as needed.
    base = "https://voccard.liveabc.com/allplus/2024/9/"
    # One URL per day number; range(1, 20) yields days 1 through 19.
    pages = [f"{base}{day}" for day in range(1, 20)]
    for page in pages:
        vocabulary, title = crawler(page)
        result = {"vocabularies": vocabulary}
        # Each page is written to its own JSON file named after the page title.
        output = f"{title}.json"
        with open(output, 'w', encoding="utf-8") as json_file:
            # ensure_ascii=False keeps the Chinese text readable in the file.
            json_file.write(dumps(result, ensure_ascii=False, indent=4))
        print(f"Vocabulary data saved to '{output}'.")

Thanks for reading!

Allplus的單字

Fri Jan 03 2025

150 words · 2 minutes

Daily 不知道