Sunday, July 15, 2012

Search Engine, Phase I: Collecting links in a seed page

1:  #A program to gather links present in a webpage and to print them.  
2:  import urllib2  
# Module-level state: both names start at 0; `contents` is later rebound to
# the downloaded page by web_contents().
pos = contents = 0
# Seed URL typed in by the user.
website_1 = raw_input("Enter the website")
def web_contents(website):
    """Download *website* and return the body of the response.

    The body is also stored in the module-level ``contents`` variable, which
    the rest of the script reads after this call.

    Args:
        website: URL to open; passed unchanged to ``urllib2.urlopen``.

    Returns:
        The value returned by ``read()`` on the opened URL.

    Errors from ``urllib2.urlopen`` or ``read()`` are not caught here and
    propagate to the caller.
    """
    # Without this declaration the assignment below would create a local
    # variable instead of rebinding the module-level name.
    global contents
    web_page = urllib2.urlopen(website)
    try:
        contents = web_page.read()
    finally:
        # Always release the connection, even when the read fails.
        web_page.close()
    return contents
# Fetch the seed page. web_contents() already stores the page in the global
# `contents`; binding its return value just makes that assignment explicit.
contents = web_contents(website_1)
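13:  #print contents  # to read a variable outside the function that sets it, the variable must be global (module-level)
14:  # "global" means a module-level name: it only has to be assigned before it is first read (here `contents` is initialised near the top of the script); it does not have to appear before the function definitions
15:  # a function that assigns to a global variable must declare it with the global keyword; merely reading it needs no declaration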
16:  #http://stackoverflow.com/questions/423379/using-global-variables-in-a-function-other-than-the-one-that-created-them  
def parse_contents(contents, pos):
    """Print and return the absolute links found in the anchor tags of a page.

    The page text is scanned for the literal marker ``<a href=``. For each
    occurrence the quoted attribute value that follows is read (either
    double or single quotes), and it is kept when it starts with ``http``.
    Anchors whose href is relative or unquoted are skipped. The match is
    case-sensitive and requires ``href`` to be the first attribute.

    Args:
        contents: Page source as a string.
        pos: Unused; kept so existing two-argument calls keep working.

    Returns:
        List of the collected URLs, in document order. Each URL is also
        printed on its own line.
    """
    marker = "<a href="
    links = []
    cursor = 0
    while True:
        tag_at = contents.find(marker, cursor)
        if tag_at == -1:
            # No more anchors: stop instead of slicing with -1, which used
            # to append a bogus trailing entry.
            break
        value_at = tag_at + len(marker)
        quote = contents[value_at:value_at + 1]
        if quote not in ('"', "'"):
            # Unquoted (or missing) value: nothing reliable to extract.
            cursor = value_at
            continue
        value_at += 1
        value_end = contents.find(quote, value_at)
        if value_end == -1:
            # Unterminated attribute value; the rest cannot be parsed.
            break
        url = contents[value_at:value_end]
        # Only look at this anchor's own value, so a relative href no longer
        # makes the scan grab an unrelated "http" further down the page.
        if url.startswith("http"):
            links.append(url)
        cursor = value_end + 1
    for each in links:
        # Parenthesised single argument prints identically on Python 2 and 3.
        print(each)
    return links
# Extract the links from the downloaded page and print them.
parse_contents(contents=contents, pos=pos)
47:  #def extract_links(url_to_be_passed):  
48:  #     list=[]  
49:  #     list.append(url_to_be_passed)