0

I've been trying to solve this problem for a while, i have a Sqlite3 database and i am using python, what i have attempted is i have loads of titles of movies, tv shows, books etc, which i clean up using regular expressions (removing spaces & irrelevant information such as seasons and episodes).

I then take my data and group them by category, return them by matching search terms & number of matches found. Essentially if the word i have searched occurs in the database under my specified category it will return all terms which contain my orignal search term, listing them out and showing number of occurances.

I then scrape the web to find a series of IMDB codes, which i then save into a notepad file, for each group of titles which shows in my search results. My real goal with the IMDB codes is to save them into my database under every occurance of the specified title i am looking for. But i cannot and i do not know what the issue is, any help would be greatly appreciated.

import re
import time
import sqlite3

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options


conn = sqlite3.connect('databasename.sqlite')
cursor = conn.cursor()

options = Options()
driver = webdriver.Chrome(options=options)

imdb_main_page = 'https://www.imdb.com/'

url = ''
current_url = url

similar_categories = ['tv', 'tv_sd', 'tv_hd', 'tv_uhd']
current_category = None

running = True


def clean_database():
    try:
        cursor.execute(f"UPDATE items SET title = REPLACE(title, '.', ' ')")
        conn.commit()
        return True
    except Exception as e:
        print(f'Error cleaning database: {e}')
        return False


def category_selection():
    global current_category

    while True:
        user_input = input('What category would you like? ')

        cursor.execute('SELECT DISTINCT cat FROM items')
        available_categories = [row[0] for row in cursor.fetchall()]

        if user_input in available_categories:
            current_category = user_input
            print(f"Items in category '{user_input}':")
            break
        else:
            print(f"Category '{user_input}' is not available.")


def update_database_with_imdb_codes(title_with_occurrences, current_category):
    for title, occurrences in title_with_occurrences:
        imdb = find_imdb_code(title)
        if imdb:
            update_query = f"UPDATE items SET imdb = ? WHERE title = ? AND cat = ?"
            cursor.execute(update_query, (imdb, title, current_category))
            conn.commit()


def user_search():
    if current_category:
        search_term = input(f"What are you looking for in '{current_category}'? ")
        cursor.execute(f"SELECT title, COUNT(title) AS title_count FROM items WHERE title LIKE ? AND cat = ? "
                        "GROUP BY title ORDER BY title_count DESC, title",
                       ('%' + search_term + '%', current_category))
    else:
        search_term = input("What are you looking for? ")
        cursor.execute("SELECT title, COUNT(title) AS title_count FROM items WHERE title LIKE ? "
                       "GROUP BY title ORDER BY title_count DESC, title",
                       ('%' + search_term + '%',))

    pattern = re.compile(r'\b\w*\d\w*\b')

    search_result = cursor.fetchall()
    file_size = find_size(search_result)
    unique_titles = set()

    first_instance_list = []

    if search_result:
        print(f"Search results for '{search_term}':")
        term_count = {}

        for result, size in zip(search_result, file_size):
            match = re.search(pattern, result[0])
            if match:
                term = result[0][:match.start()].strip()
                term_count[term] = term_count.get(term, 0) + result[1]
            else:
                term = result[0]
                term_count[term] = term_count.get(term, 0) + result[1]

            if term not in unique_titles:
                unique_titles.add(term)

                first_instance_list.append((term, term_count[term]))

        sorted_terms = sorted(term_count.items(), key=lambda x: x[1], reverse=True)

        for term, count in sorted_terms:
            print(f"{term}, Occurrences: {count}")

        update_database_with_imdb_codes(first_instance_list, current_category)
    else:
        print(f"No result found for '{search_term}'.")

    for title, count in first_instance_list:
        find_imdb_code(title)


def extract_imdb_id_from_url(url):
    pattern = r'/title/tt(\w+)/'
    match = re.search(pattern, url)
    if match:
        imdb_id = match.group(1)
        return 'Extracted IMDB ID:', imdb_id

    return None


def find_imdb_code(title):
    try:
        driver.get(imdb_main_page)

        search_box = driver.find_element(By.ID, 'suggestion-search')
        search_box.clear()
        search_box.send_keys(title)

        time.sleep(2)

        search_box.send_keys(Keys.ARROW_DOWN)
        search_box.send_keys(Keys.RETURN)

        driver.implicitly_wait(2)

        # Adjust this class name according to the actual class on IMDb site
        imdb_url = driver.current_url
        print(f'Extracted IMDB URL for {title}:', imdb_url)

        imdb_id = extract_imdb_id_from_url(imdb_url)

        start_index = imdb_url.index('title/') + len('title/')
        end_index = imdb_url.index('/', start_index)
        desired_part = imdb_url[start_index:end_index]

        file_path = 'extracted_imdb_code.txt'

        with open(file_path, 'r') as file:
            lines = file.readlines()
            for line in lines:
                if desired_part in line:
                    print(f'Duplicate IMDB code found for {title}. Skipping...')
                    return imdb_main_page

        with open(file_path, 'a') as file:
            file.write(title + ': ' + desired_part + '\n')

        print(f'{title} saved to', file_path)

    except Exception as e:
        print(f'An error occurred: {str(e)}')
        import traceback
        traceback.print_exc()

    return imdb_main_page


def find_size(search_result):
    file_size = []

    try:
        titles = [result[0] for result in search_result]
        placeholders = ",".join(["?" for _ in titles])

        query = f"SELECT title, size FROM items WHERE title IN ({placeholders})"
        cursor.execute(query, titles)
        rows = cursor.fetchall()

        size_by_title = {row[0]: row[1] for row in rows}

        for title in titles:
            if title in size_by_title:
                size_value = size_by_title[title]
                if size_value is None:
                    print(f"File size not available for '{title}'")
                    file_size.append(None)
                else:
                    size_gb = size_value / 1000000000
                    size_mb = size_value / 1000000
                    size_kb = size_value / 1000

                    if size_gb >= 1:
                        file_size.append(f'{round(size_gb)}GB')
                    elif size_mb >= 1:
                        file_size.append(f'{round(size_mb)}MB')
                    else:
                        file_size.append(f'{round(size_kb)}KB')

            else:
                print(f"File '{title}' not found.")
                file_size.append(None)

        return file_size

    except Exception as e:
        print(f'An error occurred: {str(e)}')
        return [None] * len(search_result)


def check_for_duplicates(file_path):
    imdb_codes = set()

    try:
        with open(file_path, 'r') as file:
            lines = file.readlines()
            for line in lines:
                parts = line.strip().split(': ')
                if len(parts) == 2:
                    _, imdb_code = parts
                    imdb_codes.add(imdb_code)

        return imdb_codes
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
        return imdb_codes


while running:
    if clean_database():
        print('Data has been cleaned.')
        category_selection()
        user_search()

conn.close()

Here is some entries from my database.

8451    C1E4E91F4E915CE57E420EBC95DD7E86B49546C2    community s06e11 modern espionage 720p web-dl dd5 1 h 264-ntb[rartv]    2015-05-21 05:46:50 tv  942931968   bo1c84a 
8969    CB407F1CCA49A30FBDF1940308254459BD3C6F5C    community s06e11 720p webrip x264-tastetv[rartv]    2015-05-19 09:39:06 tv  781975552   yq2uv19 
8970    34B1E877F296DC550CD251836892AED926840C8E    community s06e11 720p webrip x264-batv[rartv]   2015-05-19 09:39:04 tv  792985600       
8971    37095630D87FE409AA6F7A97377EAD31E4235974    community s06e11 1080p webrip x264-fihtv[rartv] 2015-05-19 09:17:13 tv  1763180544      
10502   7C91CC0E07C5EA1C9AEB427FFC7CFD73D2B9FB50    community s06e10 basic rv repair and palmistry 1080p web-dl dd5 1 h 264-ntb[rartv]  2015-05-14 13:47:59 tv  1085014016      
10503   5ABEEC0079C21AAE5EA1CA168FB6CC6FDB2AA39E    community s06e10 basic rv repair and palmistry 720p web-dl dd5 1 h 264-ntb[rartv]   2015-05-14 13:46:00 tv  857735168   lv4xiao 
11518   D24574A2B14ABDC58F50DBD24332644645523519    community s06e10 720p webrip x264-tastetv[rartv]    2015-05-12 09:44:57 tv  643301376   l2agjv8 
12656   60644CE421A80F27B3B66750970342489058F326    dc greensburg a story of community rebuilding 13of13 back on the map 720p hdtv x264 ac3 mvgroup org mkv 2015-05-09 00:13:03 tv  1681915904  zek4dhs 
12663   424566C68F0D2048D385E70ADDD4BBCE7CCA16CC    dc greensburg a story of community rebuilding 12of13 storm watch 720p hdtv x264 ac3 mvgroup org mkv 2015-05-08 20:51:48 tv  1476132864  hbg8zow 
12666   BDBBA91BD0417D31B0ED40C7A9597798AC79BA4D    dc greensburg a story of community rebuilding 11of13 the anniversary 720p hdtv x264 ac3 mvgroup org mkv 2015-05-08 18:04:47 tv  1546387456  on5fzkr 
12783   DFB227076F187A3B912C7A583A4515C1CE87997E    dc greensburg a story of community rebuilding 10of13 election 720p hdtv x264 ac3 mvgroup org mkv    2015-05-08 11:28:40 tv  1536425984  15czr8a 
13247   B5E72D937B9C7864EF72B9D99BAE0FFD14930219    dc greensburg a story of community rebuilding 09of13 on the road 720p hdtv x264 ac3 mvgroup org mkv 2015-05-08 01:03:54 tv  1553203200  ksqmlr2 
13249   7F9EF9528519E46A73082EFBC1457358638268DE    dc greensburg a story of community rebuilding 08of13 hoop dreams 720p hdtv x264 ac3 mvgroup org mkv 2015-05-07 23:10:06 tv  1549008896  jr6qdvc 
13251   340FDDFEB8EAB40E02877B570FB25C6281E174C1    dc greensburg a story of community rebuilding 07of13 the primary 720p hdtv x264 ac3 mvgroup org mkv 2015-05-07 21:32:50 tv  1429733376  dcblw19 
13419   D0F15CC13E4379483F4BD7146E5770C9B3DCD2F3    dc greensburg a story of community rebuilding 06of13 mini greenbuild 720p hdtv x264 ac3 mvgroup org mkv 2015-05-07 18:13:50 tv  1508376576  c56xytk 
13422   A46E910EE55DF747D9A534F653DD1F36AFBE526D    dc greensburg a story of community rebuilding 05of13 state of the union 720p hdtv x264 ac3 mvgroup org mkv  2015-05-07 13:11:25 tv  1593573376  crezu6n 
13719   1A22A741A5D2371899D59FB5EF4BC8624909B420    dc greensburg a story of community rebuilding 04of13 ice storm 720p hdtv x264 ac3 mvgroup org mkv   2015-05-07 01:07:03 tv  1556086784  unj6kbe 
13790   8F3EC81C89662402849B43E8FFE24F07933FF031    community s06e09 grifting 101 1080p web-dl dd5 1 h 264-ntb[rartv]   2015-05-07 01:46:18 tv  1179385856      
13791   95871835AA630EC7D53E628150A37154A1AFB193    community s06e09 grifting 101 720p web-dl dd5 1 h 264-ntb[rartv]    2015-05-07 01:44:44 tv  954990592       
13843   EB75E50EAD9239684D572B952EC2B09674EECCBB    dc greensburg a story of community rebuilding 03of13 the building begins 720p hdtv x264 ac3 mvgroup org mkv 2015-05-07 00:04:42 tv  1623195648  9v3wa7i 
13844   63FB3C9B5D8D6ACFDAD15165E34B017DD3DFE14D    dc greensburg a story of community rebuilding 02of13 homecoming 720p hdtv x264 ac3 mvgroup org mkv  2015-05-06 22:56:27 tv  1515978752  cp32gid 
13847   53BFD7373DE7CD34DF160BB59A72829B4604E7C2    dc greensburg a story of community rebuilding 01of13 the tornado 720p hdtv x264 ac3 mvgroup org mkv 2015-05-06 21:14:59 tv  1617166336  a1lbgk4 
14558   5C47EDCE878838EA053AD6C890E202D0E8DD7285    community s06e09 720p webrip x264-tastetv[rartv]    2015-05-05 09:32:50 tv  746848256       
14559   D7F56078720DFF938F2081AC2AD92E7EDDAEEB10    community s06e09 1080p webrip x264-fihtv[rartv] 2015-05-05 09:26:28 tv  1774714880      
16383   55CDD0BEE147E4F5489FE0E6122B1A43817A46D6    community s06e08 intro to recycled cinema 1080p web-dl dd5 1 h 264-ntb[rartv]   2015-04-30 10:19:53 tv  1166540800      
16384   221CEA9EBCD1C8A391E528BAC0EBDB2432C7CC47    community s06e08 intro to recycled cinema 720p web-dl dd5 1 h 264-ntb[rartv]    2015-04-30 10:18:28 tv  928514048       
17267   5BB7F0D150558DDE19CDEA7763343CFDFA5B5F3A    community s06e08 720p webrip x264-batv[rartv]   2015-04-28 09:52:05 tv  757334016       
17268   F16BF47ACD062DB2A454DF6175B2ECC6BFE47AC6    community s06e08 1080p webrip x264-fihtv[rartv] 2015-04-28 09:28:16 tv  1751646208      
18096   48DB8A165D4E00C19C39DD27E48A0059EE495C03    community s06e07 advanced safety feature 1080p web-dl dd5 1 h 264-ntb[rartv]    2015-04-25 18:22:06 tv  1169948672      
18097   D51DD44BE70B081458CA1D8FAEE07E0DA3F0F538    community s06e07 advanced safety feature 720p web-dl dd5 1 h 264-ntb[rartv] 2015-04-25 18:17:39 tv  947912704       
20064   14101803D6AAC2D639DA094F930E711EB11F3F77    community s06e07 720p webrip x264-tastetv[rartv]    2015-04-21 09:42:29 tv  710672384       
20065   41F6F431E6BEC9C5E05CD8701C91A16F3289B4B6    community s06e07 1080p webrip x264-fihtv[rartv] 2015-04-21 09:32:30 tv  1759248384      
20091   7BA453CED462A3B89719369B1702EC2B8A389C3D    community s06e06 basic email security 1080p web-dl dd5 1 h 264-ntb[rartv]   2015-04-21 09:08:11 tv  1153957888      
20092   4ACE0F9AA112A286CF3AA4C8B6F73053C6970C3F    community s06e06 basic email security 720p web-dl dd5 1 h 264-ntb[rartv]    2015-04-21 09:06:55 tv  901775360       
22892   C782B5EE82526A5250907008789E7905CAD79D49    community s06e06 720p webrip x264-batv[rartv]   2015-04-14 09:43:16 tv  707788800   

and here is how it looks on my system.

Database Image

10
  • Please post the error you are experiencing. Commented Jan 2, 2024 at 22:09
  • I do not get any errors, it runs as if everything is working correctly but when i refresh my database the column for imdb codes does not update. Commented Jan 3, 2024 at 0:52
  • Please attach a print of update_query for update_database_with_imdb_codes method Commented Jan 3, 2024 at 1:04
  • 1
    these is using sqlite3 not mysql Commented Jan 3, 2024 at 1:11
  • Please post an output of .schema in sqlite3 so I can reproduce the table structure Commented Jan 3, 2024 at 1:13

0

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.