常春藤雜誌的單字

1
import requests
2
from bs4 import BeautifulSoup
3
import re
4
import csv
5
from json import dumps
6

7
# Article pages to fetch; each URL yields one CSV file and one JSON file.
urls = [
    "https://www.ivy.com.tw/newsLetter/analysis_cont/2024050715515268612",
]
10

11
# Part-of-speech tags recognised at the start of a meaning segment. The order
# is kept from the original correction table so replacements run identically.
_POS_TAGS = (
    'vt.', 'vi.', 'n.', 'a.', 'adv.',
    'prep.', 'conj.', 'interj.', 'v.', 'adj.',
)

# Compiled once at import time instead of on every loop iteration.
_POS_PATTERN = re.compile(
    r'^(vt\.|vi\.|n\.|a\.|adv\.|prep\.|conj\.|interj\.|v\.|adj\.)'
)


def reclaimed(text):
    """Split a meaning string into part-of-speech / definition pairs.

    Tags written with a stray leading dot (e.g. ``.vt.``) are first rewritten
    to their clean form (``vt.``) everywhere in ``text``. The text is then
    split on ``&`` and each stripped segment is checked for a leading tag.

    Args:
        text: Raw meaning text, with segments separated by ``&``.

    Returns:
        A list with one dict per segment. ``'pos'`` holds the leading tag, or
        ``''`` when the segment does not start with a known tag; ``'chinese'``
        holds the rest of the segment with surrounding whitespace removed.
        An empty ``text`` yields a single entry with both values empty.
    """
    for tag in _POS_TAGS:
        text = text.replace('.' + tag, tag)

    results = []
    for segment in text.split('&'):
        segment = segment.strip()
        match = _POS_PATTERN.match(segment)
        pos = match.group(1) if match else ''
        # With no tag, len(pos) is 0 and the whole segment is the definition.
        results.append({'pos': pos, 'chinese': segment[len(pos):].strip()})
    return results
41

42

43
def _first_text(tag):
    """Return the stripped text of ``tag``'s first child.

    When the tag has no children, or its first child is a nested tag rather
    than a text node, fall back to the stripped text of the whole tag so the
    caller never hits an IndexError or a non-callable ``.strip``.
    """
    children = tag.contents
    if children and isinstance(children[0], str):
        return children[0].strip()
    return tag.get_text(strip=True)


def _text_or_empty(tag):
    """Return ``tag.text`` stripped, or ``""`` when the lookup found nothing."""
    return tag.text.strip() if tag is not None else ""


# For every URL: download the page, collect word and phrase entries, then
# write them to "<MM-DD>_<title>.csv" and "<MM-DD>_<title>.json".
for url in urls:
    try:
        # A timeout stops a stalled server from hanging the script forever.
        response = requests.get(url, timeout=30)
        # Do not parse (and export) an HTTP error page as if it were content.
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"下載失敗 {url}: {err}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # The first two spans with this inline font size are read as month, day.
    date_spans = soup.find_all("span", style=re.compile("font-size:2.4em"))
    if len(date_spans) >= 2:
        month = date_spans[0].text.strip().zfill(2)
        day = date_spans[1].text.strip().zfill(2)
        file_pub_date = f"{month}-{day}"
    else:
        file_pub_date = "unknown"

    title = soup.find("h1", class_="title")
    if title:
        file_name = title.text.strip().replace(" ", "_")
        # Replace characters that are not allowed in file names.
        file_name = re.sub(r'[<>:"/\\|?*]', '_', file_name)
    else:
        file_name = "output"

    csv_file_name = f"{file_pub_date}_{file_name}.csv"
    json_file_name = f"{file_pub_date}_{file_name}.json"

    # (term, part of speech, meaning) rows: words first, then phrases.
    entries = []

    words_block = soup.find("div", id="words_block")
    if words_block:
        for word_block in words_block.find_all("div", class_="my-voca-word"):
            word = _first_text(word_block)
            meaning_tag = word_block.find_next("div", class_="my-voca-mean-con")
            # One row per part-of-speech segment of the meaning text.
            for meaning in reclaimed(_text_or_empty(meaning_tag)):
                entries.append((word, meaning['pos'], meaning['chinese']))

    phrases_block = soup.find("div", id="phrases_block")
    if phrases_block:
        for phrase_block in phrases_block.find_all("div", class_="my-voca-word"):
            # Keep the tag in its own variable: rebinding it to the extracted
            # string made the later .find("div") a str.find call, which
            # returns an int and fails on ".text".
            meaning_tag = phrase_block.find("div")
            if meaning_tag is None:
                # NOTE(review): assumes phrases may use the same layout as
                # words (meaning in a following "my-voca-mean-con" div);
                # confirm against the live page markup.
                meaning_tag = phrase_block.find_next(
                    "div", class_="my-voca-mean-con"
                )
            entries.append(
                (_first_text(phrase_block), "phr.", _text_or_empty(meaning_tag))
            )

    with open(csv_file_name, mode='w', encoding='utf-8', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(["word/phrase", "pos", "chinese"])
        csv_writer.writerows(entries)

    # Build the JSON from the in-memory rows instead of re-reading the CSV.
    result = {
        "vocabularies": [
            {"vocabulary": term, "partOfSpeech": pos, "chinese": chinese}
            for term, pos, chinese in entries
        ]
    }
    with open(json_file_name, 'w', encoding="utf-8") as file:
        file.write(dumps(result, ensure_ascii=False))

    print(f"成功儲存{csv_file_name} 和 {json_file_name}")
Thanks for reading!
常春藤雜誌的單字

Fri Jan 03 2025
288 words · 4 minutes
Daily 不知道