0

I'm parsing HTML/XML text and removing a set of tags, such as fs and img, from it using BeautifulSoup.

The document I'm parsing has <tag ... > instead of <tag ..../>

So I wanted to convert every <tag ... > to <tag ... /> using a regex. Basically, I am replacing matches of the regex <tag.*> with <tag/>, but this takes a lot of time on large documents. Is there a better regex, or a better way to solve this problem?

Or, using bs, can I remove tags like <tag> (without a closing tag)?

EDIT

Reason for this question

I want to parse a lot of documents for a deep learning project, so I'm scraping the web. I do not need the <sometag /> tags from my tag set to be present in my documents, so I'm using bs to remove them. Some of the documents contain tags without a closing tag, so bs cannot create a correct parse tree for them. So I'm using a regex to add the termination, but it is not efficient.

My code

# My tag is fs here
# Replace every match of "<fs.*>" with a bare "<fs/>"; the replacement string
# carries no attributes, so whatever the match covered is dropped.
# NOTE(review): ".*" is greedy and "." does not match a newline by default, so
# each match runs from "<fs" to the LAST ">" on the same line.
content = re.sub(r"<fs.*>", "<fs/>", content)

# Parse the rewritten text with the "html.parser" backend.
sp = BeautifulSoup(content, "html.parser")

# Call extract() on every element returned by sp("fs"); the list built by the
# comprehension is discarded, only the side effect on `sp` is wanted.
[s.extract() for s in sp("fs")]

Sample input:

<Sentence id='2001'>
1   ((  CCP <fs name='CCP' drel='sent-adv:VGNF'>
1.1 ಹೀಗೆ    CC__CCS <fs af='ಹೀಗೆ,avy,,,,d,0,0' name='ಹೀಗೆ'>
    ))
2   ((  NP  <fs name='NP' drel='r6:NP2'>
2.1 ಪತ್ರದ   N__NN   <fs af='ಪತ್ರ,n,,sg,3,o,ಅದ್+ಅ,ax+a' name='ಪತ್ರದ'>
    ))
3   ((  NP  <fs name='NP2' drel='k1:VGF'>
3.1 ಯಾವ DM__DMQ <fs af='ಯಾವ,pn,,,,d,0,0' name='ಯಾವ'>
3.2 ಯಾವ DM__DMQ <fs af='ಯಾವ,pn,,,,d,0,0' name='ಯಾವ2'>
3.3 ಅಂಶ N__NN   <fs af='ಅಂಶ,n,,sg,3,d,0,0' name='ಅಂಶ'>
    ))
4   ((  NP  <fs name='NP3' drel='k7p:VGF'>
4.1 ಎಲ್ಲೆಲ್ಲಿ   PR__PRQ <fs af='ಎಲ್ಲಿ,pn,,,,o,ಎಲ್ಲಿ,eVlli' name='ಎಲ್ಲೆಲ್ಲಿ'>
    ))
5   ((  VGF <fs af=',,,,,,,' name='VGF' drel='ccof:CCP2' stype='imperative' voicetype='active'>
5.1 ಇರಬೇಕು  V__VM__VF   <fs af='ಇರು,v,,,,,ಅ+ಬೇಕು,a+beku' name='ಇರಬೇಕು'>
    ))
6   ((  CCP <fs name='CCP2' drel='k2:VGNF'>
6.1 ಎಂಬುದನ್ನು   CC__CCS <fs af='ಎಂಬುದು,avy,,,,,ಅ+ಅನ್ನು,a+annu' name='ಎಂಬುದನ್ನು'>
    ))
7   ((  VGNF    <fs name='VGNF' drel='rh:VGF2'>
7.1 ಗೊತ್ತುಪಡಿಸುವುದರಿಂದ  V__VM__VNF  <fs af='ಗೊತ್ತುಪಡಿಸು,v,,,,,ಉವ್+ಉ+ಉದ್+ಅ+ಅರ್+ಇ+ಇಂದ,uv+u+ux+a+ar+i+iMxa' name='ಗೊತ್ತುಪಡಿಸುವುದರಿಂದ'>
    ))
8   ((  NP  <fs name='NP4' drel='r6:NP5'>
8.1 ಇದರ PR__PRP <fs af='ಇದು,pn,n,sg,3,o,ಅ+ಅರ್+ಅ,a+ar+a' name='ಇದರ'>
    ))
9   ((  NP  <fs name='NP5' drel='k1:VGF2'>
9.1 ಲೇಖಕ    N__NN   <fs af='ಲೇಖಕ,n,m,sg,3,d,0,0' name='ಲೇಖಕ'>
    ))
10  ((  NP  <fs name='NP6' drel='k1s:VGF2'>
10.1    ಅತಿಯಾದ  JJ  <fs af='ಅತಿ,adj,,sg,3,o,ಇಯ್+ಆ+ಆಗು+ದ್+ಅ,iy+A+Agu+x+a' name='ಅತಿಯಾದ'>
10.2    ಸಂಪ್ರದಾಯ    N__NN   <fs af='ಸಂಪ್ರದಾಯ,n,,sg,3,d,0,0' name='ಸಂಪ್ರದಾಯ'>
10.3    ಶರಣ N__NN   <fs af='ಶರಣ,n,,sg,3,d,0,0' name='ಶರಣ'>
    ))
11  ((  VGF <fs af=',,,,,,,' name='VGF2' drel='ccof:CCP3' stype='imperative' voicetype='active'>
11.1    ಆಗಬೇಕು  V__VM__VF   <fs af='ಆಗು,v,,,,,ಅ+ಬೇಕು,a+beku' name='ಆಗಬೇಕು'>
    ))
12  ((  CCP <fs name='CCP3' drel='vmod:NULL__VGF'>
12.1    ಎಂದು    CC__CCS <fs af='ಎಂದು,avy,,,,d,0,0' name='ಎಂದು'>
    ))
13  ((  NEGP    <fs af=',,,,,,,' name='NEGP' drel='pof:NULL__VGF'>
13.1    ಅಲ್ಲ    RP__NEG <fs af='ಅಲ್ಲ,avy,,,,d,0,0' name='ಅಲ್ಲ'>
    ))
14  ((  NULL__VGF   <fs af=',,,,,,,' name='NULL__VGF' stype='declarative' voicetype='active'>
14.1    NULL    V__VM__VF   <fs af=',,,,,,,' troot='' mtype='non-gap' name='NULL'>
    ))
15  ((  BLK <fs name='BLK' drel='rsym-eos:NULL__VGF'>
15.1    .   RD__PUNC    <fs af='.,punc,,,,,,' name='.'>
    ))
</Sentence>

<Sentence id='2002'>
1   ((  NP  <fs name='NP' drel='k2:VGNF'>
1.1 ಸ್ಥೂಲವಾದ    JJ  <fs af='ಸ್ಥೂಲ,adj,,sg,3,o,ಅವ್+ಆ+ಆಗು+ದ್+ಅ,av+A+Agu+x+a' name='ಸ್ಥೂಲವಾದ'>
1.2 ಈ   DM__DMD <fs af='ಈ,pn,,,,d,0,0' name='ಈ'>
1.3 ಕಟ್ಟುಗಳನ್ನು N__NN   <fs af='ಕಟ್ಟು,n,,pl,3,o,ಗಳು+ಅ+ಅನ್ನು,galYu+a+annu' name='ಕಟ್ಟುಗಳನ್ನು'>
    ))
2   ((  VGNF    <fs name='VGNF' drel='nmod:VGF'>
2.1 ಅನುಸರಿಸಿಯೂ  V__VM__VNF  <fs af='ಅನುಸಾರ,v,,,,,ಇ+ಇಸ್+ಉ,i+is+u' droot1='ಅನುಸರಿಸು' dcat1='v' dcase1='o' dcm1='ಇ' dsuff1='i' droot2='ಅನುಸರಿಸಿ' dcat2='v' dcase2='o' dcm2='ಇಯ್+ಊ' dsuff2='iy+U' droot3='ಅನುಸರಿಸಿಯೂ' dcat3='v' dcase3='d' dcm3='0' dsuff3='0' name='ಅನುಸರಿಸಿಯೂ'>
    ))
3   ((  NP  <fs name='NP2' drel='k1:VGF'>
3.1 ಆತ  PR__PRP <fs af='ಆತ,pn,m,sg,3,d,0,0' name='ಆತ'>
    ))
4   ((  NP  <fs name='NP3' drel='r6:NP4'>
4.1 ತನ್ನ    PR__PRF <fs af='ತಾನು,pn,any,sg,2,o,ನ್ನ್+ಅ,nn+a' name='ತನ್ನ'>
    ))
5   ((  NP  <fs name='NP4' drel='k2:VGF'>
5.1 ವೈಶಿಷ್ಟ್ಯವನ್ನು  N__NN   <fs af='ವೈಶಿಷ್ಟ್ಯ,n,,sg,3,o,ಅವ್+ಅ+ಅನ್ನು,av+a+annu' name='ವೈಶಿಷ್ಟ್ಯವನ್ನು'>
    ))
6   ((  NP  <fs name='NP5' drel='k7:VGF'>
6.1 ಅದರಲ್ಲಿ PR__PRP <fs af='ಅದು,pn,n,sg,3,o,ಅ+ಅರ್+ಅ+ಅಲ್ಲಿ,a+ar+a+alli' name='ಅದರಲ್ಲಿ'>
    ))
7   ((  VGF <fs af=',,,,,,,' name='VGF' stype='declarative' voicetype='active'>
7.1 ತೋರ್ಪಡಿಸಬಹುದು   V__VM__VF   <fs af='ತೋರ್ಪಡೆ,v,,,,,ಇ+ಇಸ್+ಉ,i+is+u' droot1='ತೋರ್ಪಡಿಸು' dcat1='v' dcase1='o' dcm1='ಅ+ಬಹುದು' dsuff1='a+bahuxu' droot2='ತೋರ್ಪಡಿಸಬಹುದು' dcat2='v' dcase2='d' dcm2='0' dsuff2='0' name='ತೋರ್ಪಡಿಸಬಹುದು'>
    ))
8   ((  BLK <fs name='BLK' drel='rsym_eos:VGF'>
8.1 .   RD__PUNC    <fs af='.,punc,,,,,,' name='.'>
    ))
</Sentence>

<Sentence id='2003'>
1   ((  NP  <fs name='NP' drel='r6v:VGNF'>
1.1 ಪದ್ಯಕ್ಕೆ    N__NN   <fs af='ಪದ್ಯ,n,,sg,3,o,ಕ್ಕೆ,kkeV' name='ಪದ್ಯಕ್ಕೆ'>
    ))
2   ((  NP  <fs name='NP2' drel='r6:NP3'>
2.1 ಛಂದಸ್ಸಿನ    N__NN   <fs af='ಛಂದಸ್ಸು,n,,sg,3,o,ಇ+ಇನ್+ಅ,i+in+a' name='ಛಂದಸ್ಸಿನ'>
    ))
3   ((  NP  <fs name='NP3' drel='k1:VGNF'>
3.1 ಕಟ್ಟು   N__NN   <fs af='ಕಟ್ಟು,n,,sg,3,d,0,0' name='ಕಟ್ಟು'>
    ))
4   ((  VGNF    <fs name='VGNF' drel='vmod:VGNF5'>
4.1 ಇದ್ದರೂ  V__VM__VNF  <fs af='ಇರು,v,,,,,ದ್ದ್+ಅ+ಅರ್+ಊ,xx+a+ar+U' name='ಇದ್ದರೂ'>
    ))
5   ((  JJP <fs af=',,,,,,,' name='JJP' drel='nmod:NP4'>
5.1 ಅದರಲ್ಲೂ JJ  <fs af='ಅದು,adj,,sg,3,o,ಅ+ಅರ್+ಅ+ಅಲ್ಲಿ+ಊ,a+ar+a+alli+U' name='ಅದರಲ್ಲೂ'>
    ))
6   ((  NP  <fs name='NP4' drel='k2:VGNF2'>
6.1 ಕಾವ್ಯ   N__NN   <fs af='ಕಾವ್ಯ,n,,sg,3,d,0,0' name='ಕಾವ್ಯ'>
6.2 ಗುಣವನ್ನು    N__NN   <fs af='ಗುಣ,n,,sg,3,o,ಅವ್+ಅ+ಅನ್ನು,av+a+annu' name='ಗುಣವನ್ನು'>
    ))
7   ((  NP  <fs name='NP5' drel='k1:VGNF2'>
7.1 ಒಬ್ಬ    N__NN   <fs af='ಒಬ್ಬ,n,m,sg,3,d,0,0' name='ಒಬ್ಬ'>
7.2 ಉತ್ತಮ   JJ  <fs af='ಉತ್ತಮ,adj,,,,d,0,0' name='ಉತ್ತಮ'>
7.3 ಕವಿ N__NN   <fs af='ಕವಿ,n,,sg,3,d,0,0' name='ಕವಿ'>
    ))
8   ((  RBP <fs af=',,,,,,,' name='RBP' drel='adv:VGNF2'>
8.1 ಹೇಗೆ    RB  <fs af='ಹೇಗೆ,adv,,,,,,' name='ಹೇಗೆ'>
    ))
9   ((  VGNF    <fs af=',,,,,,,' name='VGNF2' drel='nmod__relc:RBP2'>
9.1 ಪ್ರಕಟಿಸಬಲ್ಲನೋ   V__VM__VNF  <fs af='ಪ್ರಕಟ,v,,,,,ಇ+ಇಸ್+ಉ,i+is+u' droot1='ಪ್ರಕಟಿಸು' dcat1='v' dcase1='o' dcm1='ಅ+ಬಲ್ಲ+ಅನ್+ಓ' dsuff1='a+balla+an+o' droot2='ಪ್ರಕಟಿಸಬಲ್ಲನೋ' dcat2='v' dg2='m' dn2='sg' dp2='3' dcase2='d' dcm2='0' dsuff2='0' name='ಪ್ರಕಟಿಸಬಲ್ಲನೋ'>
    ))
10  ((  RBP <fs af=',,,,,,,' name='RBP2' drel='pof:VGNF5'>
10.1    ಹಾಗೆ    RB  <fs af='ಹಾಗೆ,adv,,,,,,' name='ಹಾಗೆ'>
    ))
11  ((  NP  <fs name='NP6' drel='k4:VGNF3'>
11.1    ಪತ್ರಕ್ಕೆ    N__NN   <fs af='ಪತ್ರ,n,,sg,3,o,ಕ್ಕೆ,kkeV' name='ಪತ್ರಕ್ಕೆ'>
    ))
12  ((  NP  <fs name='NP7' drel='k1:VGNF3'>
12.1    ಒಂದು    QT__QTC <fs af='ಒಂದು,num,,sg,3,d,0,0' name='ಒಂದು'>
12.2    ಗೊತ್ತಾದ JJ  <fs af='ಗೊತ್ತು,adj,,,,o,ಆ+ಆಗು+ದ್+ಅ,A+Agu+x+a' name='ಗೊತ್ತಾದ'>
12.3    ಮಾದರಿ   N__NN   <fs af='ಮಾದರಿ,n,,sg,3,d,0,0' name='ಮಾದರಿ'>
    ))
13  ((  VGNF    <fs name='VGNF3' drel='vmod:VGNF4'>
13.1    ಇದ್ದರೂ  V__VM__VNF  <fs af='ಇರು,v,,,,,ದ್ದ್+ಅ+ಅರ್+ಊ,xx+a+ar+U' name='ಇದ್ದರೂ2'>
    ))
14  ((  JJP <fs af=',,,,,,,' name='JJP2' drel='nmod:NP8'>
14.1    ಅದರಲ್ಲೂ JJ  <fs af='ಅದು,adj,,sg,3,o,ಅ+ಅರ್+ಅ+ಅಲ್ಲಿ+ಊ,a+ar+a+alli+U' name='ಅದರಲ್ಲೂ2'>
    ))
15  ((  NP  <fs name='NP8' drel='k1:VGNF4'>
15.1    ಲೇಖಕ    N__NN   <fs af='ಲೇಖಕ,n,m,sg,3,d,0,0' name='ಲೇಖಕ'>
    ))
16  ((  NP  <fs name='NP9' drel='r6:NP10'>
16.1    ತನ್ನ    PR__PRF <fs af='ತಾನು,pn,any,sg,2,o,ನ್ನ್+ಅ,nn+a' name='ತನ್ನ'>
    ))
17  ((  NP  <fs name='NP10' drel='k2:VGNF4'>
17.1    ವೈಯಕ್ತಿಕ    JJ  <fs af='ವೈಯಕ್ತಿಕ,adj,,sg,3,d,0,0' name='ವೈಯಕ್ತಿಕ'>
17.2    ಪ್ರಭಾವವನ್ನು N__NN   <fs af='ಪ್ರಭಾವ,n,,sg,3,o,ಅವ್+ಅ+ಅನ್ನು,av+a+annu' name='ಪ್ರಭಾವವನ್ನು'>
    ))
18  ((  VGNF    <fs name='VGNF4' drel='vmod:VGNF5'>
18.1    ಬೀರಿ    V__VM__VNF  <fs af='ಬೀರು,v,,,,,ಇ,i' name='ಬೀರಿ'>
    ))
19  ((  NP  <fs name='NP11' drel='k1:VGNF5'>
19.1    ಅದು PR__PRP <fs af='ಅದು,pn,n,sg,3,d,0,0' name='ಅದು'>
    ))
20  ((  NP  <fs name='NP12' drel='k1s:VGNF5'>
20.1    ಪರಿಣಾಮಕಾರಿ  N__NN   <fs af='ಪರಿಣಾಮಕಾರಿ,n,,sg,3,d,0,0' name='ಪರಿಣಾಮಕಾರಿ'>
    ))
21  ((  VGNF    <fs name='VGNF5' drel='vmod:VGF'>
21.1    ಆಗುವಂತೆ V__VM__VNF  <fs af='ಆಗು,v,,,,,ಉವ್+ಅ+ಅಂತೆ,uv+a+aMweV' name='ಆಗುವಂತೆ'>
    ))
22  ((  VGF <fs af=',,,,,,,' name='VGF' stype='declarative' voicetype='active'>
22.1    ಮಾಡಬಹುದು    V__VM__VF   <fs af='ಮಾಡು,v,,,,,ಅ+ಬಹುದು,a+bahuxu' name='ಮಾಡಬಹುದು'>
    ))
23  ((  BLK <fs name='BLK' drel='rsym_eos:VGF'>
23.1    .   RD__PUNC    <fs af='.,punc,,,,,,' name='.'>
    ))
</Sentence>
15
  • 1
    Try replacing <img(.*?)> with <img$1/>. Commented Apr 14, 2019 at 9:50
  • What I did works Commented Apr 14, 2019 at 9:50
  • So what's your question? Commented Apr 14, 2019 at 9:51
  • 2
    1. In HTML, unlike in the nowadays rarely used XHTML, you don't need the closing tag. 2. Don't try to parse HTML with regular expressions. Commented Apr 14, 2019 at 9:51
  • I want a more efficient way Commented Apr 14, 2019 at 9:51

2 Answers 2

1

You can remove all <img> tags by using BeautifulSoup:

import requests
import bs4

# Download the question page; the raw response body is what gets parsed.
page = requests.get('https://stackoverflow.com/questions/55673916/python-string-replace-html-tags')
soup = bs4.BeautifulSoup(page.content, 'html.parser')

# Detach every <img> element from the parsed tree, one at a time.
for img_tag in soup.find_all('img'):
    img_tag.extract()

# Show the document as it looks once the <img> tags are gone
print(str(soup))

This works for me with both <img ... /> and <img ...> tags.

Sign up to request clarification or add additional context in comments.

2 Comments

Similar to img, there are some more tags like fs, etc., but bs adds its own terminator at the end. So when you call extract, most of the information is lost.
I'll upvote your answer, but it is not the solution I'm looking for
0

So this will need to be solved in steps because it's not clear what exactly you're asking for.

I fall squarely in the frustrated by BS4 crowd and prefer to use regex situationally, although I recognize that some disagree.

If you're just trying to add closing tags to the opening tags, and the form of your document is relatively consistently formatted, based on your example, try this

>>> import re


>>> string = '''1   ((  CCP <fs name='CCP' drel='sent-adv:VGNF'>
1.1 ಹೀಗೆ    CC__CCS <fs af='ಹೀಗೆ,avy,,,,d,0,0' name='ಹೀಗೆ'>
    ))
2   ((  NP  <fs name='NP' drel='r6:NP2'>
2.1 ಪತ್ರದ   N__NN   <fs af='ಪತ್ರ,n,,sg,3,o,ಅದ್+ಅ,ax+a' name='ಪತ್ರದ'>
    ))
3   ((  NP  <fs name='NP2' drel='k1:VGF'>
3.1 ಯಾವ DM__DMQ <fs af='ಯಾವ,pn,,,,d,0,0' name='ಯಾವ'>
3.2 ಯಾವ DM__DMQ <fs af='ಯಾವ,pn,,,,d,0,0' name='ಯಾವ2'>
3.3 ಅಂಶ N__NN   <fs af='ಅಂಶ,n,,sg,3,d,0,0' name='ಅಂಶ'>'''


>>> print(re.sub(r'(<fs[\S\s]+?\'>)', r'\1</fs>', string))


#Output
1   ((  CCP <fs name='CCP' drel='sent-adv:VGNF'></fs>
1.1 ಹೀಗೆ    CC__CCS <fs af='ಹೀಗೆ,avy,,,,d,0,0' name='ಹೀಗೆ'></fs>
    ))
2   ((  NP  <fs name='NP' drel='r6:NP2'></fs>
2.1 ಪತ್ರದ   N__NN   <fs af='ಪತ್ರ,n,,sg,3,o,ಅದ್+ಅ,ax+a' name='ಪತ್ರದ'></fs>
    ))
3   ((  NP  <fs name='NP2' drel='k1:VGF'></fs>
3.1 ಯಾವ DM__DMQ <fs af='ಯಾವ,pn,,,,d,0,0' name='ಯಾವ'></fs>
3.2 ಯಾವ DM__DMQ <fs af='ಯಾವ,pn,,,,d,0,0' name='ಯಾವ2'></fs>
3.3 ಅಂಶ N__NN   <fs af='ಅಂಶ,n,,sg,3,d,0,0' name='ಅಂಶ'></fs>

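And if you're trying to remove them (note that this replaces each opening tag with a bare </fs>; pass an empty replacement string '' instead to delete the tag entirely)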

>>> print(re.sub(r'(<fs[\S\s]+?\'>)', r'</fs>', string))


#OUTPUT
1   ((  CCP </fs>
1.1 ಹೀಗೆ    CC__CCS </fs>
    ))
2   ((  NP  </fs>
2.1 ಪತ್ರದ   N__NN   </fs>
    ))
3   ((  NP  </fs>
3.1 ಯಾವ DM__DMQ </fs>
3.2 ಯಾವ DM__DMQ </fs>
3.3 ಅಂಶ N__NN   </fs>

If you just want to "CONVERT" the opening tags to closing tags

>>> print(re.sub(r'(<)(fs)(?=[\S\s]+?\'>)', r'\1/\2', string))


#OUTPUT
1   ((  CCP </fs name='CCP' drel='sent-adv:VGNF'>
1.1 ಹೀಗೆ    CC__CCS </fs af='ಹೀಗೆ,avy,,,,d,0,0' name='ಹೀಗೆ'>
    ))
2   ((  NP  </fs name='NP' drel='r6:NP2'>
2.1 ಪತ್ರದ   N__NN   </fs af='ಪತ್ರ,n,,sg,3,o,ಅದ್+ಅ,ax+a' name='ಪತ್ರದ'>
    ))
3   ((  NP  </fs name='NP2' drel='k1:VGF'>
3.1 ಯಾವ DM__DMQ </fs af='ಯಾವ,pn,,,,d,0,0' name='ಯಾವ'>
3.2 ಯಾವ DM__DMQ </fs af='ಯಾವ,pn,,,,d,0,0' name='ಯಾವ2'>
3.3 ಅಂಶ N__NN   </fs af='ಅಂಶ,n,,sg,3,d,0,0' name='ಅಂಶ'>

Or, more akin to your sample

>>> print(re.sub(r'(<fs[\S\s]+?)(\'>)', r'\1/\2', string))


#OUTPUT
1   ((  CCP <fs name='CCP' drel='sent-adv:VGNF/'>
1.1 ಹೀಗೆ    CC__CCS <fs af='ಹೀಗೆ,avy,,,,d,0,0' name='ಹೀಗೆ/'>
    ))
2   ((  NP  <fs name='NP' drel='r6:NP2/'>
2.1 ಪತ್ರದ   N__NN   <fs af='ಪತ್ರ,n,,sg,3,o,ಅದ್+ಅ,ax+a' name='ಪತ್ರದ/'>
    ))
3   ((  NP  <fs name='NP2' drel='k1:VGF/'>
3.1 ಯಾವ DM__DMQ <fs af='ಯಾವ,pn,,,,d,0,0' name='ಯಾವ/'>
3.2 ಯಾವ DM__DMQ <fs af='ಯಾವ,pn,,,,d,0,0' name='ಯಾವ2/'>
3.3 ಅಂಶ N__NN   <fs af='ಅಂಶ,n,,sg,3,d,0,0' name='ಅಂಶ/'>

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.