常春藤雜誌的單字

明倫單字卡只差常春藤的單字

Fri Jan 03 2025
288 words · 4 minutes
import requests
from bs4 import BeautifulSoup
import re
import csv
from json import dumps
# Pages to scrape; every URL listed here is fetched and processed in turn by
# the `for url in urls` loop below.
urls = [
"https://www.ivy.com.tw/newsLetter/analysis_cont/2024050715515268612",
]
def reclaimed(text):
    """Break a meaning string into part-of-speech / Chinese-gloss entries.

    The text is first normalised by dropping the extra leading dot from
    abbreviations such as ``.vt.``, then split on ``&``.  Each segment is
    trimmed and, when it begins with a known abbreviation, that abbreviation
    is separated from the remaining text.

    Args:
        text: Raw meaning string, e.g. ``"n. foo & vt. bar"``.

    Returns:
        A list with one ``{'pos': ..., 'chinese': ...}`` dict per
        ``&``-separated segment, in order.  ``pos`` is an empty string when
        the segment does not start with a recognised abbreviation.
    """
    # Dotted spellings mapped to the form the pattern below recognises.
    dotted_to_clean = {
        '.vt.': 'vt.',
        '.vi.': 'vi.',
        '.n.': 'n.',
        '.a.': 'a.',
        '.adv.': 'adv.',
        '.prep.': 'prep.',
        '.conj.': 'conj.',
        '.interj.': 'interj.',
        '.v.': 'v.',
        '.adj.': 'adj.'
    }
    cleaned = text
    for dotted, clean in dotted_to_clean.items():
        cleaned = cleaned.replace(dotted, clean)

    leading_pos = re.compile(
        r'^(vt\.|vi\.|n\.|a\.|adv\.|prep\.|conj\.|interj\.|v\.|adj\.)'
    )

    def parse_segment(segment):
        """Return the entry for a single ``&``-separated segment."""
        segment = segment.strip()
        hit = leading_pos.match(segment)
        label = hit.group(1) if hit else ''
        # With no label the slice is the whole (already trimmed) segment.
        return {'pos': label, 'chinese': segment[len(label):].strip()}

    return [parse_segment(segment) for segment in cleaned.split('&')]
# For every page in `urls`: download it, pull out the vocabulary words and
# phrases, and save them as a CSV file and a JSON file named after the
# page's date and title.
for url in urls:
    # A timeout keeps a stalled connection from hanging the script, and an
    # HTTP error page is skipped instead of being parsed into empty files.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"無法取得{url}:{exc}")
        continue
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')

    # The first two large-font spans are used as month and day respectively.
    pubDate = soup.find_all("span", style=re.compile("font-size:2.4em"))
    if len(pubDate) >= 2:
        month = pubDate[0].text.strip().zfill(2)
        day = pubDate[1].text.strip().zfill(2)
        file_pubDate = f"{month}-{day}"
    else:
        file_pubDate = "unknown"

    title = soup.find("h1", class_="title")
    if title:
        file_name = title.text.strip().replace(" ", "_")
        # Replace characters that are not allowed in file names.
        file_name = re.sub(r'[<>:"/\\|?*]', '_', file_name)
    else:
        file_name = "output"
    csv_file_name = f"{file_pubDate}_{file_name}.csv"
    json_file_name = f"{file_pubDate}_{file_name}.json"

    words = []
    phrases = []

    words_block = soup.find("div", id="words_block")
    if words_block:
        for word_block in words_block.find_all("div", class_="my-voca-word"):
            word = word_block.contents[0].strip()
            meaning_block = word_block.find_next("div", class_="my-voca-mean-con")
            # find_next returns None when no meaning element follows; keep the
            # word with an empty meaning rather than crashing on `.text`.
            chinese_block = meaning_block.text.strip() if meaning_block is not None else ""
            for meaning in reclaimed(chinese_block):
                words.append({
                    "word": word,
                    "pos": meaning['pos'],
                    "chinese": meaning['chinese']
                })

    phrases_block = soup.find("div", id="phrases_block")
    if phrases_block:
        for phrase_block in phrases_block.find_all("div", class_="my-voca-word"):
            # Keep the tag and its text in separate names: the meaning lookup
            # must run on the tag, not on the extracted string.
            phrase = phrase_block.contents[0].strip()
            meaning_block = phrase_block.find("div")
            if meaning_block is None:
                # NOTE(review): the page markup is not visible here; this
                # fallback mirrors the lookup used for words above - confirm
                # against a real page.
                meaning_block = phrase_block.find_next("div", class_="my-voca-mean-con")
            chinese = meaning_block.text.strip() if meaning_block is not None else ""
            phrases.append({"phrase": phrase, "pos": "phr.", "chinese": chinese})

    # One flat row list feeds both output files, so the JSON no longer has to
    # be rebuilt by re-reading the CSV that was just written.
    rows = [[entry['word'], entry['pos'], entry['chinese']] for entry in words]
    rows += [[entry['phrase'], entry['pos'], entry['chinese']] for entry in phrases]

    with open(csv_file_name, mode='w', encoding='utf-8', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(["word/phrase", "pos", "chinese"])
        csv_writer.writerows(rows)

    result = {
        "vocabularies": [
            {"vocabulary": vocabulary, "partOfSpeech": pos, "chinese": chinese}
            for vocabulary, pos, chinese in rows
        ]
    }
    with open(json_file_name, 'w', encoding="utf-8") as file:
        file.write(dumps(result, ensure_ascii=False))

    # The two file names are separated so the message stays readable.
    print(f"成功儲存{csv_file_name}、{json_file_name}")

Thanks for reading!

常春藤雜誌的單字

Fri Jan 03 2025
288 words · 4 minutes