HELP: SCRAPING WEBSITE DIRECTORIES

Singida ndio home

JF-Expert Member
Feb 23, 2012
I've been struggling with this issue all day.

There's a jokes website called ackyshine.com.

My problem: what I want is to scrape the links from the jokes directory.

The jokes live in this directory:

"www.ackyshine.com/vichekesho"

its links look like this:

"www.ackyshine.com/vichekesho:ajira-ngumu"
"www.ackyshine.com/vichekesho:bangi-hizi"
"www.ackyshine.com/vichekesho:masai-katisha"
"www.ackyshine.com/vichekesho:udocta-noma"
"www.ackyshine.com/vichekesho-bomba:vichekeshobongo"

cc:
Chief-Mkwawa
Thefreedom
Rootadmin
Mwl.RCT
 
I built this YESTERDAY, but I can see it still has bugs and doesn't scrape all the jokes. I'm not sure it's what you wanted, but it was fun and challenging, so
HAVE FUN WITH IT

Python:
import os
import getpass
import requests
from bs4 import BeautifulSoup


def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print('Error: Creating directory. ' + directory)


username = getpass.getuser()
directory = f"C:\\Users\\{username}\\Desktop\\er40r"
createFolder(directory)
os.chdir(directory)

url = "http://www.ackyshine.com/vichekesho:_home"
html = requests.get(url)
soup = BeautifulSoup(html.text, "html.parser")

try:
    html.raise_for_status()
except Exception as exc:
    print('There was a problem: %s' % (exc))


def mchanganyiko(lists):
    # Scrape every joke linked from the "Mchanganyiko" (mixed) list.
    items = lists[0].select('.list-pages-item')
    try:
        for i in range(21):  # hard-coded item count; adjust if the list changes
            link = items[i].select('a')
            visit = 'http://www.ackyshine.com' + link[0].get('href')
            html = requests.get(visit)
            soup = BeautifulSoup(html.text, "html.parser")

            try:
                html.raise_for_status()
            except Exception as exc:
                print('There was a problem: %s' % (exc))

            header = soup.select('span strong')
            paragraph = soup.select('p')
            # Site-specific indices: header[2] is the joke title, and
            # paragraph[5]/paragraph[6] hold the joke text.
            with open(f"mchanganyiko{i}.txt", "w", encoding="utf-8") as word:
                word.write(f'{header[2].getText()}\n')
                word.write(f'\n{paragraph[5].getText()}\n')
                word.write(f'\n{paragraph[6].getText()}\n')
    except Exception as exc:
        print('Report if error is printed. # mchanganyiko')


def mipya(lists):
    # Scrape every joke linked from the "Mipya" (new) list.
    items = lists[1].select('.list-pages-item')
    try:
        for i in range(21):  # hard-coded item count; adjust if the list changes
            link = items[i].select('a')
            visit = 'http://www.ackyshine.com' + link[0].get('href')
            html = requests.get(visit)
            soup = BeautifulSoup(html.text, "html.parser")

            try:
                html.raise_for_status()
            except Exception as exc:
                print('There was a problem: %s' % (exc))

            header = soup.select('span strong')
            paragraph = soup.select('p')
            # Same fixed indices as in mchanganyiko().
            with open(f"mipya{i}.txt", "w", encoding="utf-8") as word:
                word.write(f'{header[2].getText()}\n')
                word.write(f'\n{paragraph[5].getText()}\n')
                word.write(f'\n{paragraph[6].getText()}\n')
    except Exception as exc:
        print("Report if error is printed. #mipya")


def specifics(url):
    # Scrape a single joke page given its full URL.
    html = requests.get(url)
    soup = BeautifulSoup(html.text, "html.parser")

    try:
        html.raise_for_status()
    except Exception as exc:
        print('There was a problem: %s' % (exc))

    header = soup.select('span strong')
    paragraph = soup.select('p')

    # Same fixed indices: the title plus the two joke paragraphs.
    with open("specifics.txt", "w", encoding="utf-8") as word:
        word.write(f'{header[2].getText()}\n')
        word.write(f'\n{paragraph[5].getText()}\n')
        word.write(f'\n{paragraph[6].getText()}\n')


lists = soup.select('.col-sm-6')

specifics("http://www.ackyshine.com/vichekesho:hawa-wanaume-wanaopenda-wanawake-hovyo-barabarani")
mchanganyiko(lists)
mipya(lists)
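
One portability note: the script hard-codes a Windows desktop path (C:\Users\<username>\Desktop\er40r). A hypothetical, OS-independent alternative builds the same folder from the user's home directory and sidesteps getpass entirely:

Python:
import os

# Build the er40r output folder relative to the current user's home.
directory = os.path.join(os.path.expanduser("~"), "Desktop", "er40r")
os.makedirs(directory, exist_ok=True)  # no error if it already exists
os.chdir(directory)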
 

Attachments

  • mapIt.rar (920 bytes)
UPDATE:

Python:
import os
import getpass
import requests
from bs4 import BeautifulSoup

def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print('Error: Creating directory. ' + directory)

username = getpass.getuser()
directory = f"C:\\Users\\{username}\\Desktop\\er40r"
createFolder(directory)
os.chdir(directory)

url = "http://www.ackyshine.com/vichekesho:_home"
html = requests.get(url)
soup = BeautifulSoup(html.text, "html.parser")

try:
    html.raise_for_status()
except Exception as exc:
    print('There was a problem: %s' % (exc))


def mchanganyiko(lists):
    # Scrape every joke linked from the "Mchanganyiko" (mixed) list.
    items = lists[0].select('.list-pages-item')
    try:
        for i in range(23):  # hard-coded item count; adjust if the list changes
            link = items[i].select('a')
            visit = 'http://www.ackyshine.com' + link[0].get('href')
            html = requests.get(visit)
            soup = BeautifulSoup(html.text, "html.parser")

            try:
                html.raise_for_status()
            except Exception as exc:
                print('There was a problem: %s' % (exc))

            header = soup.select('span strong')
            try:
                # Heuristic from the post: the title is the first <strong>
                # longer than 40 characters.
                for each in range(10):
                    if len(header[each].text) > 40:
                        with open(f"mchanganyiko{i}.txt", "w", encoding="utf-8") as word:
                            word.write(f'{header[each].getText()}\n')
                        break
            except Exception as exc:
                # Ran out of headers without a match: fall back to the first one.
                with open(f"mchanganyiko{i}.txt", "w", encoding="utf-8") as word:
                    word.write(f'{header[0].getText()}\n')

            paragraph = soup.select('p')
            try:
                # The joke body is the first <p> longer than 100 characters;
                # open in append mode ("a") so the header above survives.
                for num in range(10):
                    if len(paragraph[num].text) > 100:
                        with open(f"mchanganyiko{i}.txt", "a", encoding="utf-8") as word:
                            word.write(f'\n{paragraph[num].getText()}\n')
                        break
                    else:
                        print('retrying')
            except Exception as exc:
                print("Reduce number of range.1")
    except Exception as exc:
        print('Reduce number of range.2')


def mipya(lists):
    # Scrape every joke linked from the "Mipya" (new) list.
    items = lists[1].select('.list-pages-item')
    try:
        for i in range(20):  # hard-coded item count; adjust if the list changes
            link = items[i].select('a')
            visit = 'http://www.ackyshine.com' + link[0].get('href')
            html = requests.get(visit)
            soup = BeautifulSoup(html.text, "html.parser")

            try:
                html.raise_for_status()
            except Exception as exc:
                print('There was a problem: %s' % (exc))

            header = soup.select('span strong')
            try:
                # Same title heuristic as in mchanganyiko().
                for each in range(10):
                    if len(header[each].text) > 40:
                        with open(f"mipya{i}.txt", "w", encoding="utf-8") as word:
                            word.write(f'{header[each].getText()}\n')
                        break
            except Exception as exc:
                # Ran out of headers without a match: fall back to the first one.
                with open(f"mipya{i}.txt", "w", encoding="utf-8") as word:
                    word.write(f'{header[0].getText()}\n')

            paragraph = soup.select('p')
            try:
                # Append the first sufficiently long <p> under the header.
                for num in range(10):
                    if len(paragraph[num].text) > 100:
                        with open(f"mipya{i}.txt", "a", encoding="utf-8") as word:
                            word.write(f'\n{paragraph[num].getText()}\n')
                        break
                    else:
                        print('retrying...')
            except Exception as exc:
                print("Reduce number of range (#paragraph1).")
    except Exception as exc:
        print("Reduce number of range (#paragraph2).")


lists = soup.select('.col-sm-6')

mchanganyiko(lists)
mipya(lists)
Boss, there where you have
url = "http://www.ackyshine.com/vichekesho:_home"
put
url = "http://www.ackyshine.com/vichekesho:"
 

Same thing (check the address bar). Also, here is what it does:
  • It starts at the homepage and grabs the two tables:
    • Mchanganyiko (mixed)
    • Mipya (new)
  • It takes each link in mchanganyiko, opens it, and copies the <p> HTML tag of no fewer than 100 characters (you can change that if you want) and the headline <strong> of no fewer than 40 characters, then runs the same process on mipya (see the sketch after this list).
  • Then it saves all ~43 of them as .txt files on the desktop, inside the er40r folder, within about 10 seconds.
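
Since mchanganyiko() and mipya() differ only in which list they read and in the filename prefix, the whole process above can be folded into one helper. Here is a minimal sketch that reuses the script's requests/BeautifulSoup setup and its 40/100-character heuristics; scrape_list and its parameters are made-up names for illustration, not part of the original script:

Python:
def scrape_list(lists, index, prefix, count):
    # index 0 = Mchanganyiko, index 1 = Mipya; prefix names the output files.
    items = lists[index].select('.list-pages-item')
    for i in range(min(count, len(items))):
        href = items[i].select('a')[0].get('href')
        page = requests.get('http://www.ackyshine.com' + href)
        page.raise_for_status()
        s = BeautifulSoup(page.text, "html.parser")

        # First <strong> longer than 40 chars is the title,
        # first <p> longer than 100 chars is the joke body.
        title = next((h.getText() for h in s.select('span strong')
                      if len(h.text) > 40), '')
        body = next((p.getText() for p in s.select('p')
                     if len(p.text) > 100), '')

        with open(f"{prefix}{i}.txt", "w", encoding="utf-8") as out:
            out.write(f'{title}\n\n{body}\n')


scrape_list(lists, 0, "mchanganyiko", 23)
scrape_list(lists, 1, "mipya", 20)
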
AND FOR SPECIFIC LINKS:
copy the url you want, then call the "specifics" function (from the first script) and paste the link as the argument,
so it will be something like:

Python:
specifics('http://www.ackyshine.com/vichekesho:kweli-mitandao-imeharibu-watu-angalia-huyu-anacho')

Don't forget the quotation marks ("").
AND ALSO, THE CODE IS UPDATED FROM 100 BUGS TO ZERO{0} BUGS

# PROJECT COMPLETE 1Kb @ file
 