I've been trying to solve this problem for a while, i have a Sqlite3 database and i am using python, what i have attempted is i have loads of titles of movies, tv shows, books etc, which i clean up using regular expressions (removing spaces & irrelevant information such as seasons and episodes).
I then take my data and group them by category, return them by matching search terms & number of matches found. Essentially if the word i have searched occurs in the database under my specified category it will return all terms which contain my orignal search term, listing them out and showing number of occurances.
I then scrape the web to find a series of IMDB codes, which i then save into a notepad file, for each group of titles which shows in my search results. My real goal with the IMDB codes is to save them into my database under every occurance of the specified title i am looking for. But i cannot and i do not know what the issue is, any help would be greatly appreciated.
import re
import time
import sqlite3
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
conn = sqlite3.connect('databasename.sqlite')
cursor = conn.cursor()
options = Options()
driver = webdriver.Chrome(options=options)
imdb_main_page = 'https://www.imdb.com/'
url = ''
current_url = url
similar_categories = ['tv', 'tv_sd', 'tv_hd', 'tv_uhd']
current_category = None
running = True
def clean_database():
try:
cursor.execute(f"UPDATE items SET title = REPLACE(title, '.', ' ')")
conn.commit()
return True
except Exception as e:
print(f'Error cleaning database: {e}')
return False
def category_selection():
global current_category
while True:
user_input = input('What category would you like? ')
cursor.execute('SELECT DISTINCT cat FROM items')
available_categories = [row[0] for row in cursor.fetchall()]
if user_input in available_categories:
current_category = user_input
print(f"Items in category '{user_input}':")
break
else:
print(f"Category '{user_input}' is not available.")
def update_database_with_imdb_codes(title_with_occurrences, current_category):
for title, occurrences in title_with_occurrences:
imdb = find_imdb_code(title)
if imdb:
update_query = f"UPDATE items SET imdb = ? WHERE title = ? AND cat = ?"
cursor.execute(update_query, (imdb, title, current_category))
conn.commit()
def user_search():
if current_category:
search_term = input(f"What are you looking for in '{current_category}'? ")
cursor.execute(f"SELECT title, COUNT(title) AS title_count FROM items WHERE title LIKE ? AND cat = ? "
"GROUP BY title ORDER BY title_count DESC, title",
('%' + search_term + '%', current_category))
else:
search_term = input("What are you looking for? ")
cursor.execute("SELECT title, COUNT(title) AS title_count FROM items WHERE title LIKE ? "
"GROUP BY title ORDER BY title_count DESC, title",
('%' + search_term + '%',))
pattern = re.compile(r'\b\w*\d\w*\b')
search_result = cursor.fetchall()
file_size = find_size(search_result)
unique_titles = set()
first_instance_list = []
if search_result:
print(f"Search results for '{search_term}':")
term_count = {}
for result, size in zip(search_result, file_size):
match = re.search(pattern, result[0])
if match:
term = result[0][:match.start()].strip()
term_count[term] = term_count.get(term, 0) + result[1]
else:
term = result[0]
term_count[term] = term_count.get(term, 0) + result[1]
if term not in unique_titles:
unique_titles.add(term)
first_instance_list.append((term, term_count[term]))
sorted_terms = sorted(term_count.items(), key=lambda x: x[1], reverse=True)
for term, count in sorted_terms:
print(f"{term}, Occurrences: {count}")
update_database_with_imdb_codes(first_instance_list, current_category)
else:
print(f"No result found for '{search_term}'.")
for title, count in first_instance_list:
find_imdb_code(title)
def extract_imdb_id_from_url(url):
pattern = r'/title/tt(\w+)/'
match = re.search(pattern, url)
if match:
imdb_id = match.group(1)
return 'Extracted IMDB ID:', imdb_id
return None
def find_imdb_code(title):
try:
driver.get(imdb_main_page)
search_box = driver.find_element(By.ID, 'suggestion-search')
search_box.clear()
search_box.send_keys(title)
time.sleep(2)
search_box.send_keys(Keys.ARROW_DOWN)
search_box.send_keys(Keys.RETURN)
driver.implicitly_wait(2)
# Adjust this class name according to the actual class on IMDb site
imdb_url = driver.current_url
print(f'Extracted IMDB URL for {title}:', imdb_url)
imdb_id = extract_imdb_id_from_url(imdb_url)
start_index = imdb_url.index('title/') + len('title/')
end_index = imdb_url.index('/', start_index)
desired_part = imdb_url[start_index:end_index]
file_path = 'extracted_imdb_code.txt'
with open(file_path, 'r') as file:
lines = file.readlines()
for line in lines:
if desired_part in line:
print(f'Duplicate IMDB code found for {title}. Skipping...')
return imdb_main_page
with open(file_path, 'a') as file:
file.write(title + ': ' + desired_part + '\n')
print(f'{title} saved to', file_path)
except Exception as e:
print(f'An error occurred: {str(e)}')
import traceback
traceback.print_exc()
return imdb_main_page
def find_size(search_result):
file_size = []
try:
titles = [result[0] for result in search_result]
placeholders = ",".join(["?" for _ in titles])
query = f"SELECT title, size FROM items WHERE title IN ({placeholders})"
cursor.execute(query, titles)
rows = cursor.fetchall()
size_by_title = {row[0]: row[1] for row in rows}
for title in titles:
if title in size_by_title:
size_value = size_by_title[title]
if size_value is None:
print(f"File size not available for '{title}'")
file_size.append(None)
else:
size_gb = size_value / 1000000000
size_mb = size_value / 1000000
size_kb = size_value / 1000
if size_gb >= 1:
file_size.append(f'{round(size_gb)}GB')
elif size_mb >= 1:
file_size.append(f'{round(size_mb)}MB')
else:
file_size.append(f'{round(size_kb)}KB')
else:
print(f"File '{title}' not found.")
file_size.append(None)
return file_size
except Exception as e:
print(f'An error occurred: {str(e)}')
return [None] * len(search_result)
def check_for_duplicates(file_path):
imdb_codes = set()
try:
with open(file_path, 'r') as file:
lines = file.readlines()
for line in lines:
parts = line.strip().split(': ')
if len(parts) == 2:
_, imdb_code = parts
imdb_codes.add(imdb_code)
return imdb_codes
except FileNotFoundError:
print(f"File '{file_path}' not found.")
return imdb_codes
while running:
if clean_database():
print('Data has been cleaned.')
category_selection()
user_search()
conn.close()
Here is some entries from my database.
8451 C1E4E91F4E915CE57E420EBC95DD7E86B49546C2 community s06e11 modern espionage 720p web-dl dd5 1 h 264-ntb[rartv] 2015-05-21 05:46:50 tv 942931968 bo1c84a
8969 CB407F1CCA49A30FBDF1940308254459BD3C6F5C community s06e11 720p webrip x264-tastetv[rartv] 2015-05-19 09:39:06 tv 781975552 yq2uv19
8970 34B1E877F296DC550CD251836892AED926840C8E community s06e11 720p webrip x264-batv[rartv] 2015-05-19 09:39:04 tv 792985600
8971 37095630D87FE409AA6F7A97377EAD31E4235974 community s06e11 1080p webrip x264-fihtv[rartv] 2015-05-19 09:17:13 tv 1763180544
10502 7C91CC0E07C5EA1C9AEB427FFC7CFD73D2B9FB50 community s06e10 basic rv repair and palmistry 1080p web-dl dd5 1 h 264-ntb[rartv] 2015-05-14 13:47:59 tv 1085014016
10503 5ABEEC0079C21AAE5EA1CA168FB6CC6FDB2AA39E community s06e10 basic rv repair and palmistry 720p web-dl dd5 1 h 264-ntb[rartv] 2015-05-14 13:46:00 tv 857735168 lv4xiao
11518 D24574A2B14ABDC58F50DBD24332644645523519 community s06e10 720p webrip x264-tastetv[rartv] 2015-05-12 09:44:57 tv 643301376 l2agjv8
12656 60644CE421A80F27B3B66750970342489058F326 dc greensburg a story of community rebuilding 13of13 back on the map 720p hdtv x264 ac3 mvgroup org mkv 2015-05-09 00:13:03 tv 1681915904 zek4dhs
12663 424566C68F0D2048D385E70ADDD4BBCE7CCA16CC dc greensburg a story of community rebuilding 12of13 storm watch 720p hdtv x264 ac3 mvgroup org mkv 2015-05-08 20:51:48 tv 1476132864 hbg8zow
12666 BDBBA91BD0417D31B0ED40C7A9597798AC79BA4D dc greensburg a story of community rebuilding 11of13 the anniversary 720p hdtv x264 ac3 mvgroup org mkv 2015-05-08 18:04:47 tv 1546387456 on5fzkr
12783 DFB227076F187A3B912C7A583A4515C1CE87997E dc greensburg a story of community rebuilding 10of13 election 720p hdtv x264 ac3 mvgroup org mkv 2015-05-08 11:28:40 tv 1536425984 15czr8a
13247 B5E72D937B9C7864EF72B9D99BAE0FFD14930219 dc greensburg a story of community rebuilding 09of13 on the road 720p hdtv x264 ac3 mvgroup org mkv 2015-05-08 01:03:54 tv 1553203200 ksqmlr2
13249 7F9EF9528519E46A73082EFBC1457358638268DE dc greensburg a story of community rebuilding 08of13 hoop dreams 720p hdtv x264 ac3 mvgroup org mkv 2015-05-07 23:10:06 tv 1549008896 jr6qdvc
13251 340FDDFEB8EAB40E02877B570FB25C6281E174C1 dc greensburg a story of community rebuilding 07of13 the primary 720p hdtv x264 ac3 mvgroup org mkv 2015-05-07 21:32:50 tv 1429733376 dcblw19
13419 D0F15CC13E4379483F4BD7146E5770C9B3DCD2F3 dc greensburg a story of community rebuilding 06of13 mini greenbuild 720p hdtv x264 ac3 mvgroup org mkv 2015-05-07 18:13:50 tv 1508376576 c56xytk
13422 A46E910EE55DF747D9A534F653DD1F36AFBE526D dc greensburg a story of community rebuilding 05of13 state of the union 720p hdtv x264 ac3 mvgroup org mkv 2015-05-07 13:11:25 tv 1593573376 crezu6n
13719 1A22A741A5D2371899D59FB5EF4BC8624909B420 dc greensburg a story of community rebuilding 04of13 ice storm 720p hdtv x264 ac3 mvgroup org mkv 2015-05-07 01:07:03 tv 1556086784 unj6kbe
13790 8F3EC81C89662402849B43E8FFE24F07933FF031 community s06e09 grifting 101 1080p web-dl dd5 1 h 264-ntb[rartv] 2015-05-07 01:46:18 tv 1179385856
13791 95871835AA630EC7D53E628150A37154A1AFB193 community s06e09 grifting 101 720p web-dl dd5 1 h 264-ntb[rartv] 2015-05-07 01:44:44 tv 954990592
13843 EB75E50EAD9239684D572B952EC2B09674EECCBB dc greensburg a story of community rebuilding 03of13 the building begins 720p hdtv x264 ac3 mvgroup org mkv 2015-05-07 00:04:42 tv 1623195648 9v3wa7i
13844 63FB3C9B5D8D6ACFDAD15165E34B017DD3DFE14D dc greensburg a story of community rebuilding 02of13 homecoming 720p hdtv x264 ac3 mvgroup org mkv 2015-05-06 22:56:27 tv 1515978752 cp32gid
13847 53BFD7373DE7CD34DF160BB59A72829B4604E7C2 dc greensburg a story of community rebuilding 01of13 the tornado 720p hdtv x264 ac3 mvgroup org mkv 2015-05-06 21:14:59 tv 1617166336 a1lbgk4
14558 5C47EDCE878838EA053AD6C890E202D0E8DD7285 community s06e09 720p webrip x264-tastetv[rartv] 2015-05-05 09:32:50 tv 746848256
14559 D7F56078720DFF938F2081AC2AD92E7EDDAEEB10 community s06e09 1080p webrip x264-fihtv[rartv] 2015-05-05 09:26:28 tv 1774714880
16383 55CDD0BEE147E4F5489FE0E6122B1A43817A46D6 community s06e08 intro to recycled cinema 1080p web-dl dd5 1 h 264-ntb[rartv] 2015-04-30 10:19:53 tv 1166540800
16384 221CEA9EBCD1C8A391E528BAC0EBDB2432C7CC47 community s06e08 intro to recycled cinema 720p web-dl dd5 1 h 264-ntb[rartv] 2015-04-30 10:18:28 tv 928514048
17267 5BB7F0D150558DDE19CDEA7763343CFDFA5B5F3A community s06e08 720p webrip x264-batv[rartv] 2015-04-28 09:52:05 tv 757334016
17268 F16BF47ACD062DB2A454DF6175B2ECC6BFE47AC6 community s06e08 1080p webrip x264-fihtv[rartv] 2015-04-28 09:28:16 tv 1751646208
18096 48DB8A165D4E00C19C39DD27E48A0059EE495C03 community s06e07 advanced safety feature 1080p web-dl dd5 1 h 264-ntb[rartv] 2015-04-25 18:22:06 tv 1169948672
18097 D51DD44BE70B081458CA1D8FAEE07E0DA3F0F538 community s06e07 advanced safety feature 720p web-dl dd5 1 h 264-ntb[rartv] 2015-04-25 18:17:39 tv 947912704
20064 14101803D6AAC2D639DA094F930E711EB11F3F77 community s06e07 720p webrip x264-tastetv[rartv] 2015-04-21 09:42:29 tv 710672384
20065 41F6F431E6BEC9C5E05CD8701C91A16F3289B4B6 community s06e07 1080p webrip x264-fihtv[rartv] 2015-04-21 09:32:30 tv 1759248384
20091 7BA453CED462A3B89719369B1702EC2B8A389C3D community s06e06 basic email security 1080p web-dl dd5 1 h 264-ntb[rartv] 2015-04-21 09:08:11 tv 1153957888
20092 4ACE0F9AA112A286CF3AA4C8B6F73053C6970C3F community s06e06 basic email security 720p web-dl dd5 1 h 264-ntb[rartv] 2015-04-21 09:06:55 tv 901775360
22892 C782B5EE82526A5250907008789E7905CAD79D49 community s06e06 720p webrip x264-batv[rartv] 2015-04-14 09:43:16 tv 707788800
and here is how it looks on my system.
