import csv
import re
from json import dumps

import requests
from bs4 import BeautifulSoup
# Newsletter pages to scrape; the loop further down processes each entry
# and writes one CSV file and one JSON file per URL.
urls = [
    "https://www.ivy.com.tw/newsLetter/analysis_cont/2024050715515268612",
]
# (dotted form, corrected form) pairs, applied in this order.  A
# part-of-speech abbreviation that picked up a stray leading dot is rewritten
# to its plain form before parsing.
_DOTTED_POS_FIXES = (
    ('.vt.', 'vt.'),
    ('.vi.', 'vi.'),
    ('.n.', 'n.'),
    ('.a.', 'a.'),
    ('.adv.', 'adv.'),
    ('.prep.', 'prep.'),
    ('.conj.', 'conj.'),
    ('.interj.', 'interj.'),
    ('.v.', 'v.'),
    ('.adj.', 'adj.'),
)

# Compiled once at import time instead of once per '&'-separated part.
_POS_PREFIX = re.compile(
    r'^(vt\.|vi\.|n\.|a\.|adv\.|prep\.|conj\.|interj\.|v\.|adj\.)'
)


def reclaimed(text):
    """Split a meaning string into part-of-speech / Chinese-gloss entries.

    Senses are separated by ``&``.  Each sense may begin with one of the
    abbreviations ``vt. vi. n. a. adv. prep. conj. interj. v. adj.``.
    Before splitting, every dotted variant such as ``.vt.`` is replaced by
    its plain form; the replacement applies wherever the dotted form occurs
    in the text, not only at the start of a sense.

    Args:
        text: Raw meaning text, e.g. ``"vt. 回收 & n. 再生"``.

    Returns:
        A list with one ``{'pos': ..., 'chinese': ...}`` dict per
        ``&``-separated sense, in input order.  ``pos`` is ``''`` when the
        sense does not start with a recognised abbreviation, in which case
        ``chinese`` is the whole stripped sense.  An empty ``text`` yields a
        single entry whose two values are both ``''``.
    """
    for dotted, plain in _DOTTED_POS_FIXES:
        text = text.replace(dotted, plain)

    results = []
    for part in text.split('&'):
        part = part.strip()
        pos_match = _POS_PREFIX.match(part)
        pos = pos_match.group(1) if pos_match else ''
        # With no match ``pos`` is empty, so the slice keeps the whole part.
        results.append({'pos': pos, 'chinese': part[len(pos):].strip()})
    return results
# Scrape every page in `urls` and export its vocabulary and phrases to a CSV
# file and a JSON file named "<month>-<day>_<title>".
for url in urls:
    # A timeout keeps a stalled connection from hanging the script, and
    # raise_for_status() prevents an HTTP error page from being parsed as if
    # it were the article.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')

    # The first two spans styled with "font-size:2.4em" are read as month
    # and day respectively; each is zero-padded to two characters.
    pubDate = soup.find_all("span", style=re.compile("font-size:2.4em"))
    if len(pubDate) >= 2:
        month = pubDate[0].text.strip().zfill(2)
        day = pubDate[1].text.strip().zfill(2)
        file_pubDate = f"{month}-{day}"
    else:
        file_pubDate = "unknown"

    # Build the file-name stem from the page title: spaces and characters
    # matched by the pattern below become underscores.
    title = soup.find("h1", class_="title")
    if title is not None:
        file_name = title.text.strip().replace(" ", "_")
        file_name = re.sub(r'[<>:"/\\|?*]', '_', file_name)
    else:
        file_name = "output"

    csv_file_name = f"{file_pubDate}_{file_name}.csv"
    json_file_name = f"{file_pubDate}_{file_name}.json"

    words = []
    phrases = []

    # Single words: one output entry per sense returned by reclaimed().
    words_block = soup.find("div", id="words_block")
    if words_block is not None:
        for word_block in words_block.find_all("div", class_="my-voca-word"):
            word = word_block.contents[0].strip()
            meaning_tag = word_block.find_next("div", class_="my-voca-mean-con")
            # A missing meaning element is treated as empty text so the word
            # itself is still recorded.
            meaning_text = meaning_tag.text.strip() if meaning_tag is not None else ""
            for meaning in reclaimed(meaning_text):
                words.append({
                    "word": word,
                    "pos": meaning['pos'],
                    "chinese": meaning['chinese'],
                })

    # Phrases: always tagged "phr.".  The tag and its extracted text are kept
    # in separate variables; rebinding the tag variable to the text would
    # make the later .find("div") a str.find call and fail on .text.
    phrases_block = soup.find("div", id="phrases_block")
    if phrases_block is not None:
        for phrase_tag in phrases_block.find_all("div", class_="my-voca-word"):
            phrase = phrase_tag.contents[0].strip()
            chinese_tag = phrase_tag.find("div")
            chinese = chinese_tag.text.strip() if chinese_tag is not None else ""
            phrases.append({"phrase": phrase, "pos": "phr.", "chinese": chinese})

    # One flat row list feeds both outputs, so the JSON no longer has to be
    # rebuilt by reading the CSV back from disk.
    rows = [(entry['word'], entry['pos'], entry['chinese']) for entry in words]
    rows.extend(
        (entry['phrase'], entry['pos'], entry['chinese']) for entry in phrases
    )

    with open(csv_file_name, mode='w', encoding='utf-8', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(["word/phrase", "pos", "chinese"])
        csv_writer.writerows(rows)

    result = {
        "vocabularies": [
            {"vocabulary": vocabulary, "partOfSpeech": pos, "chinese": chinese}
            for vocabulary, pos, chinese in rows
        ]
    }

    with open(json_file_name, 'w', encoding="utf-8") as file:
        file.write(dumps(result, ensure_ascii=False))

    print(f"成功儲存{csv_file_name} 和 {json_file_name}")