I have searched Stack Overflow but haven't found an answer.
I have written a Python script to get data from this website:
https://resources.allsetlearning.com/chinese/grammar/Reduplication_of_adjectives
The page has two or three sentence structures, each followed by 4-5 example sentences. For example:
Structure 1
- Example 1
- Example 2

Structure 2
- Example 1
- Example 2

Structure 3
- Example 1
- Example 2
- Example 3
I managed to get all the sentence structures and example sentences, but how can I get the example sentences for Structure 1, Structure 2, and Structure 3 separately? Also, how can I avoid picking up the wrong (incorrect) sentences?
from selenium import webdriver
import time
def trim_to_first_sentence(text, terminators=("。", "?")):
    """Cut *text* right after its first sentence terminator.

    Each terminator is tried in order; when it occurs in the current text,
    everything from its first occurrence on is dropped, the remainder is
    stripped, and the terminator is re-attached after a single space.
    Text containing none of the terminators is returned unchanged.
    """
    for sep in terminators:
        if sep in text:
            text = text.split(sep, 1)[0].strip() + " " + sep
    return text


def contains_wrong_sentence(sentence, wrong_texts):
    """Return True if any non-empty entry of *wrong_texts* occurs in *sentence*.

    Empty entries are ignored because ``"" in sentence`` is always true and
    would otherwise cause every sentence to be rejected.
    """
    return any(bad in sentence for bad in wrong_texts if bad)


def main():
    """Scrape one grammar page and export its sentences to ``export.txt``.

    For every element of class ``jiegou`` a header line is written, followed
    by one tab-separated record per element of class ``spaced`` whose text
    does not contain the text of any element of class ``x``.
    """
    url = "https://resources.allsetlearning.com/chinese/grammar/Reduplication_of_adjectives"

    # NOTE(review): positional driver path and find_element(s)_by_* are
    # Selenium 3-style calls; confirm against the installed Selenium version.
    driver = webdriver.Chrome(r"C:\Users\<user>\Documents\chromedriver\chromedriver.exe") # change it
    try:
        # Both files are opened (and therefore truncated) as before; the
        # context manager guarantees they are flushed and closed on any exit.
        # NOTE(review): nothing is written to link_with_wrong.txt here.
        with open("export.txt", "w", encoding="utf8") as save_file, \
                open("link_with_wrong.txt", "w", encoding="utf8") as wrong_link_file:
            time.sleep(1)
            driver.get(url)
            time.sleep(3)  # crude wait for the page to finish rendering

            jiegou = driver.find_elements_by_class_name("jiegou")
            usedfor = driver.find_element_by_xpath("//*[@id='ibox']/ul/li[6]/div[2]")
            heading = driver.find_element_by_xpath("//*[@id='innerbodycontent']/div/div[2]/h1")
            sen = driver.find_elements_by_class_name("spaced")
            wrong = driver.find_elements_by_class_name("x")

            # Read every text once up front instead of once per loop iteration.
            usedfor_text = usedfor.text
            heading_text = heading.text
            sentence_texts = [str(s.text) for s in sen]
            # presumably class "x" marks the incorrect example sentences
            wrong_texts = [w.text for w in wrong]

            if wrong_texts:
                print("..............Found..............." + url)

            # NOTE(review): indentation was lost in the source; the sentence
            # loop is assumed to be nested inside the structure loop, so every
            # page-wide sentence is emitted under every structure - confirm.
            for j in jiegou:
                jiegou_str = ":: " + j.text + " ::"
                print(jiegou_str)
                save_file.write(jiegou_str)
                print("\n.........................................................\n")
                save_file.write("\n\n")

                for raw_sentence in sentence_texts:
                    # Skip every wrong sentence, not only the first one found.
                    if contains_wrong_sentence(raw_sentence, wrong_texts):
                        continue
                    st_sen = trim_to_first_sentence(raw_sentence)
                    all_set = st_sen + "\t" + jiegou_str + "\t" + usedfor_text + "\t" + heading_text + "\t" + url
                    print(all_set)
                    save_file.write(all_set)
                    print("\n\n")
                    save_file.write("\n\n")
    finally:
        # Always shut the browser down, even if scraping raised.
        driver.quit()


if __name__ == "__main__":
    main()